├── .gitattributes ├── .gitignore ├── Makefile ├── README.md ├── app ├── .keep ├── extensions │ ├── .keep │ └── poly1305 │ │ ├── impl.c │ │ ├── poly1305.S │ │ ├── poly1305_armv6-32.inc │ │ ├── poly1305_avx-32.inc │ │ ├── poly1305_avx-64.inc │ │ ├── poly1305_avx2-32.inc │ │ ├── poly1305_avx2-64.inc │ │ ├── poly1305_constants_x86.inc │ │ ├── poly1305_neon-32.inc │ │ ├── poly1305_ref-32.inc │ │ ├── poly1305_ref-64.inc │ │ ├── poly1305_ref-8.inc │ │ ├── poly1305_sse2-32.inc │ │ ├── poly1305_sse2-64.inc │ │ ├── poly1305_x86-32.inc │ │ └── poly1305_x86-64.inc ├── include │ ├── .keep │ └── poly1305.h ├── project.def └── project.ver ├── configure ├── framework ├── bench.c ├── driver │ ├── arm │ │ ├── cpucycles_impl.inc │ │ ├── cpuid_flags.inc │ │ ├── cpuid_impl.inc │ │ ├── cpuid_impl_linux.inc │ │ ├── cpuid_impl_msvc.inc │ │ ├── cpuid_impl_netbsd.inc │ │ └── gcc.inc │ ├── cpucycles.c │ ├── cpuid.c │ ├── gcc_driver.inc │ ├── generic │ │ ├── cpucycles_impl.inc │ │ ├── cpuid_flags.inc │ │ └── cpuid_impl.inc │ ├── x86 │ │ ├── cpucycles_impl.inc │ │ ├── cpuid_flags.inc │ │ ├── cpuid_impl.inc │ │ ├── driver.S │ │ ├── gcc.inc │ │ └── yasm.inc │ └── yasm_driver.inc ├── fuzz.c ├── include │ ├── bench.h │ ├── cpucycles.h │ ├── cpuid.h │ └── fuzz.h ├── main_shared.c └── main_util.c ├── genvs.php └── sources ├── crypto_onetimeauth_poly1305_ref_auth.c ├── crypto_onetimeauth_poly1305_x86_auth.s ├── poly1305-donna-x64-avx2-incremental-source.c ├── poly1305-donna-x64-sse2-incremental-source.c ├── poly1305-donna-x86-avx2-incremental-source.c └── poly1305-donna-x86-sse2-incremental-source.c /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 
6 | *.c text 7 | *.h text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | 12 | # Denote all files that are truly binary and should not be modified. 13 | *.png binary 14 | *.jpg binary 15 | 16 | # Included assembler files, must be LF 17 | *.inc text eol=lf 18 | 19 | # configure must be LF 20 | configure text eol=lf 21 | 22 | # project files must be LF 23 | project.def text eol=lf 24 | project.ver text eol=lf 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | asmopt.mak 2 | bin/* 3 | build/* 4 | build_util/* 5 | config.log 6 | framework/include/asmopt.h 7 | framework/include/asmopt_internal.h 8 | framework/include/util_implementations.h 9 | example 10 | example-util 11 | !example/ 12 | vs201*/ipch/* 13 | vs201*/*.sdf 14 | vs201*/*.suo 15 | vs201*/*.user 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(wildcard asmopt.mak),) 2 | $(error Run ./configure first) 3 | endif 4 | 5 | include asmopt.mak 6 | 7 | ########################## 8 | # set up variables 9 | # 10 | 11 | BASEDIR = . 
12 | BINDIR = bin 13 | BUILDDIR = build 14 | BUILDDIRUTIL = build_util 15 | INCLUDE = $(addprefix -I$(BASEDIR)/,$(appdir)/extensions $(appdir)/include framework/include framework/driver framework/driver/$(ARCH)) 16 | CINCLUDE = $(INCLUDE) 17 | ASMINCLUDE = $(INCLUDE) 18 | 19 | # yasm doesn't need includes passed to the assembler 20 | ifneq ($(AS),yasm) 21 | COMMA := , 22 | ASMINCLUDE += $(addprefix -Wa$(COMMA),$(INCLUDE)) 23 | endif 24 | 25 | ########################### 26 | # define recursive wildcard: $(call rwildcard, basepath, globs) 27 | # 28 | rwildcard = $(foreach d, $(wildcard $(1)*), $(call rwildcard, $(d)/, $(2)) $(filter $(subst *, %, $(2)), $(d))) 29 | 30 | SRCDRIVER = $(wildcard framework/driver/*.c) 31 | SRCEXT = $(call rwildcard, $(appdir)/extensions/, *.c) 32 | SRCASM = 33 | SRCMAIN = $(appdir)/main.c 34 | SRCUTIL = framework/main_util.c framework/bench.c framework/fuzz.c 35 | SRCSHARED = framework/main_shared.c 36 | 37 | 38 | # do we have an assembler? 39 | ifeq ($(HAVEAS),yes) 40 | 41 | # grab all the assembler files 42 | SRCASM = $(call rwildcard, $(appdir)/extensions/, *.S) 43 | 44 | # add asm for the appropriate arch 45 | SRCASM += $(call rwildcard, $(addsuffix $(ARCH),framework/driver/), *.S) 46 | 47 | endif 48 | 49 | ########################## 50 | # expand all source file paths in to object files in $(BUILDDIR)/$(BUILDDIRUTIL) 51 | # 52 | OBJDRIVER = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCDRIVER)) 53 | OBJEXT = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCEXT)) 54 | OBJASM = $(patsubst %.S, $(BUILDDIR)/%.o, $(SRCASM)) 55 | OBJMAIN = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCMAIN)) 56 | OBJUTIL = $(patsubst %.c, $(BUILDDIRUTIL)/%.o, $(SRCUTIL)) 57 | OBJEXTUTIL = $(patsubst %.c, $(BUILDDIRUTIL)/%.o, $(SRCEXT)) 58 | OBJSHARED = $(patsubst %.c, $(BUILDDIR)/%.o, $(SRCSHARED)) 59 | 60 | ########################## 61 | # non-file targets 62 | # 63 | .PHONY: all 64 | .PHONY: default 65 | .PHONY: makebin 66 | .PHONY: exe 67 | .PHONY: lib 68 | .PHONY: shared 69 | 
.PHONY: util 70 | 71 | .PHONY: install-shared 72 | .PHONY: install-generic 73 | .PHONY: install-lib 74 | .PHONY: uninstall 75 | 76 | .PHONY: clean 77 | .PHONY: distclean 78 | 79 | 80 | all: default 81 | 82 | default: lib 83 | 84 | makebin: 85 | @mkdir -p $(BINDIR) 86 | 87 | exe: makebin $(BINDIR)/$(PROJECTNAME)$(EXE) 88 | @echo built [$(BINDIR)/$(PROJECTNAME)$(EXE)] 89 | 90 | install-generic: 91 | $(INSTALL) -d $(includedir)/lib$(PROJECTNAME) 92 | $(INSTALL) -d $(libdir) 93 | $(INSTALL) -m 644 $(appdir)/include/*.h $(includedir)/lib$(PROJECTNAME) 94 | 95 | lib: makebin $(BINDIR)/$(PROJECTNAME)$(STATICLIB) 96 | @echo built [$(BINDIR)/$(PROJECTNAME)$(STATICLIB)] 97 | 98 | install-lib: lib install-generic 99 | $(INSTALL) -m 644 $(BINDIR)/$(PROJECTNAME)$(STATICLIB) $(libdir) 100 | $(if $(RANLIB), $(RANLIB) $(libdir)/$(PROJECTNAME)$(STATICLIB)) 101 | 102 | util: makebin $(BINDIR)/$(PROJECTNAME)-util$(EXE) 103 | @echo built [$(BINDIR)/$(PROJECTNAME)-util$(EXE)] 104 | 105 | ifeq ($(HAVESHARED),yes) 106 | shared: makebin $(BINDIR)/$(SONAME) 107 | @echo built [$(BINDIR)/$(SONAME)] 108 | 109 | install-shared: shared install-generic 110 | ifneq ($(SOIMPORT),) 111 | $(INSTALL) -d $(bindir) 112 | $(INSTALL) -m 755 $(BINDIR)/$(SONAME) $(bindir) 113 | $(INSTALL) -m 644 $(BINDIR)/$(SOIMPORT) $(libdir) 114 | else ifneq ($(SONAME),) 115 | $(INSTALL) -m 755 $(BINDIR)/$(SONAME) $(libdir) 116 | ln -f -s $(libdir)/$(SONAME) $(libdir)/lib$(PROJECTNAME).$(SOSUFFIX) 117 | endif 118 | else 119 | shared: 120 | @echo project must be /configured with --pic 121 | 122 | install-shared: 123 | @echo project must be /configured with --pic 124 | endif # HAVESHARED 125 | # uninstall must mirror the exact paths written by install-generic, install-lib and install-shared 126 | uninstall: 127 | rm -rf $(includedir)/lib$(PROJECTNAME) 128 | rm -f $(libdir)/$(PROJECTNAME)$(STATICLIB) 129 | ifneq ($(SOIMPORT),) 130 | rm -f $(bindir)/$(SONAME) $(libdir)/$(SOIMPORT) 131 | else ifneq ($(SONAME),) 132 | rm -f $(libdir)/$(SONAME) $(libdir)/lib$(PROJECTNAME).$(SOSUFFIX) 133 | endif 134 | 135 | clean: 136 | 
@echo cleaning project [$(PROJECTNAME)] 137 | @rm -rf $(BUILDDIR)/* 138 | @rm -rf $(BUILDDIRUTIL)/* 139 | @rm -rf $(BINDIR)/* 140 | # -f: distclean must not fail when a generated file is already gone 141 | distclean: clean 142 | @rm -f asmopt.mak 143 | @rm -f config.log 144 | 145 | ########################## 146 | # build rules for files 147 | # 148 | 149 | # use $(BASEOBJ) in build rules to grab the base path/name of the object file, without an extension 150 | BASEOBJ = $(BUILDDIR)/$* 151 | BASEOBJUTIL = $(BUILDDIRUTIL)/$* 152 | 153 | # building .S (assembler) files 154 | $(BUILDDIR)/%.o: %.S 155 | @mkdir -p $(dir $@) 156 | # yasm needs one pass to compile, and one to generate dependencies 157 | ifeq ($(AS),yasm) 158 | $(AS) $(ASFLAGS) $(ASMINCLUDE) -o $@ $< 159 | @$(AS) $(ASFLAGS) $(ASMINCLUDE) -o $@ -M $< >$(BASEOBJ).temp 160 | else 161 | $(AS) $(ASFLAGS) $(ASMINCLUDE) $(DEPMM) $(DEPMF) $(BASEOBJ).temp -D BUILDING_ASM -c -o $(BASEOBJ).o $< 162 | endif 163 | @cp $(BASEOBJ).temp $(BASEOBJ).P 164 | @sed \ 165 | -e 's/^[^:]*: *//' \ 166 | -e 's/ *\\$$//' \ 167 | -e '/^$$/ d' \ 168 | -e 's/$$/ :/' \ 169 | < $(BASEOBJ).temp >> $(BASEOBJ).P 170 | @rm -f $(BASEOBJ).temp 171 | 172 | # building .c (C) files 173 | $(BUILDDIR)/%.o: %.c 174 | @mkdir -p $(dir $@) 175 | $(CC) $(CFLAGS) $(CINCLUDE) $(DEPMM) $(DEPMF) $(BASEOBJ).temp -c -o $(BASEOBJ).o $< 176 | @cp $(BASEOBJ).temp $(BASEOBJ).P 177 | @sed \ 178 | -e 's/#.*//' \ 179 | -e 's/^[^:]*: *//' \ 180 | -e 's/ *\\$$//' \ 181 | -e '/^$$/ d' \ 182 | -e 's/$$/ :/' \ 183 | < $(BASEOBJ).temp >> $(BASEOBJ).P 184 | @rm -f $(BASEOBJ).temp 185 | 186 | # building .c (C) files for fuzzing/benching 187 | $(BUILDDIRUTIL)/%.o: %.c 188 | @mkdir -p $(dir $@) 189 | $(CC) $(CFLAGS) $(CINCLUDE) $(DEPMM) $(DEPMF) $(BASEOBJUTIL).temp -DUTILITIES -c -o $(BASEOBJUTIL).o $< 190 | @cp $(BASEOBJUTIL).temp $(BASEOBJUTIL).P 191 | @sed \ 192 | -e 's/#.*//' \ 193 | -e 's/^[^:]*: *//' \ 194 | -e 's/ *\\$$//' \ 195 | -e '/^$$/ d' \ 196 | -e 's/$$/ :/' \ 197 | < $(BASEOBJUTIL).temp >> $(BASEOBJUTIL).P 198 | @rm -f 
$(BASEOBJUTIL).temp 199 | 200 | 201 | ########################## 202 | # include all auto-generated dependencies 203 | # 204 | 205 | -include $(OBJDRIVER:%.o=%.P) 206 | -include $(OBJEXT:%.o=%.P) 207 | -include $(OBJASM:%.o=%.P) 208 | -include $(OBJMAIN:%.o=%.P) 209 | -include $(OBJUTIL:%.o=%.P) 210 | -include $(OBJEXTUTIL:%.o=%.P) 211 | -include $(OBJSHARED:%.o=%.P) 212 | 213 | ########################## 214 | # final build targets 215 | # 216 | $(BINDIR)/$(PROJECTNAME)$(EXE): $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJMAIN) 217 | $(CC) $(CFLAGS) -o $@ $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJMAIN) 218 | # delete the previous archive at its real path ($@) so $(AR) starts from an empty file and cannot keep stale members 219 | $(BINDIR)/$(PROJECTNAME)$(STATICLIB): $(OBJDRIVER) $(OBJEXT) $(OBJASM) 220 | rm -f $@ 221 | $(AR)$@ $(OBJDRIVER) $(OBJEXT) $(OBJASM) 222 | $(if $(RANLIB), $(RANLIB) $@) 223 | 224 | $(BINDIR)/$(PROJECTNAME)-util$(EXE): $(OBJDRIVER) $(OBJEXTUTIL) $(OBJASM) $(OBJUTIL) 225 | $(CC) $(CFLAGS) -o $@ $(OBJDRIVER) $(OBJEXTUTIL) $(OBJASM) $(OBJUTIL) 226 | 227 | ifeq ($(HAVESHARED),yes) 228 | $(BINDIR)/$(SONAME): $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJSHARED) 229 | $(LD)$@ $(OBJDRIVER) $(OBJEXT) $(OBJASM) $(OBJSHARED) $(SOFLAGS) $(LDFLAGS) 230 | endif 231 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ABOUT # 2 | 3 | This is a portable, performant implementation of [Poly1305](http://cr.yp.to/mac.html), a "secret-key message-authentication code suitable for a wide variety of applications". 4 | 5 | All assembler is PIC safe. 6 | 7 | # INITIALIZING # 8 | 9 | The library can be initialized, i.e. the most optimized implementation that passes internal tests will be automatically selected, in two ways, **neither of which are thread safe**: 10 | 11 | 1. `int poly1305_startup(void);` explicitly initializes the library, and returns a non-zero value if no suitable implementation is found that passes internal tests 12 | 13 | 2. 
Do nothing and use the library like normal. It will auto-initialize itself when needed, and hard exit if no suitable implementation is found. 14 | 15 | # CALLING # 16 | 17 | Common assumptions: 18 | 19 | * When using the incremental functions, the `poly1305_state` struct is assumed to be word aligned, if necessary, for the system in use. 20 | 21 | ## ONE SHOT ## 22 | 23 | `in` is assumed to be word aligned. Incremental support has no alignment requirements, but will obviously slow down if non word-aligned pointers are passed. 24 | 25 | `void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key);` 26 | 27 | Creates an authenticator in `mac` under the key `key` with `inlen` bytes from `in`. 28 | 29 | ## INCREMENTAL ## 30 | 31 | Incremental `in` buffers are *not* required to be word aligned. Unaligned buffers will require copying to aligned buffers however, which will obviously incur a speed penalty. 32 | 33 | `void poly1305_init(poly1305_state *S, const poly1305_key *key)` 34 | 35 | Initializes `S` with the key `key`. 36 | 37 | `void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, size_t bytes_hint)` 38 | 39 | Initializes `S` with the key `key`, and the hint that no more than `bytes_hint` bytes will be authenticated. If more than `bytes_hint` bytes are passed, in total, the result _may_ be undefined. 40 | 41 | `void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen)` 42 | 43 | Updates the state `S` with `inlen` bytes from `in`. 44 | 45 | `void poly1305_finish(poly1305_state *S, unsigned char *mac)` 46 | 47 | Performs any finalizations on `S` and stores the resulting authenticator into `mac`. 48 | 49 | # Examples # 50 | 51 | ## AUTHENTICATING DATA WITH ONE CALL ## 52 | 53 | size_t bytes = ...; 54 | unsigned char data[...] 
= {...}; 55 | poly1305_key key = {{...}}; 56 | unsigned char mac[16]; 57 | 58 | poly1305_auth(mac, data, bytes, &key); 59 | 60 | ## HASHING INCREMENTALLY ## 61 | 62 | Hashing incrementally, i.e. with multiple calls to update the state. 63 | 64 | size_t bytes = ...; 65 | unsigned char data[...] = {...}; 66 | poly1305_key key = {{...}}; 67 | unsigned char mac[16]; 68 | poly1305_state state; 69 | size_t i; 70 | 71 | poly1305_init(&state, &key); 72 | /* add one byte at a time, extremely inefficient */ 73 | for (i = 0; i < bytes; i++) { 74 | poly1305_update(&state, data + i, 1); 75 | } 76 | poly1305_finish(&state, mac); 77 | 78 | 79 | # VERSIONS # 80 | 81 | ## Reference ## 82 | 83 | There are 3 reference versions, specialized for increasingly capable systems from 8 bit-ish only operations (with the world's most inefficient portable carries, you really don't want to use this unless nothing else runs) to 64 bit. 84 | 85 | * Generic 8-bit-ish: [poly1305\_ref](app/extensions/poly1305/poly1305_ref-8.inc) 86 | * Generic 32-bit with 64-bit compiler support: [poly1305\_ref](app/extensions/poly1305/poly1305_ref-32.inc) 87 | * Generic 64-bit: [poly1305\_ref](app/extensions/poly1305/poly1305_ref-64.inc) 88 | 89 | ## x86 (32 bit) ## 90 | 91 | * 386 compatible: [poly1305\_x86](app/extensions/poly1305/poly1305_x86-32.inc) 92 | * SSE2: [poly1305\_sse2](app/extensions/poly1305/poly1305_sse2-32.inc) 93 | * AVX: [poly1305\_avx](app/extensions/poly1305/poly1305_avx-32.inc) 94 | * AVX2: [poly1305\_avx2](app/extensions/poly1305/poly1305_avx2-32.inc) 95 | 96 | The 386 compatible version is a modified version of djb's floating point public domain implementation. 97 | 98 | SSE2, AVX, and AVX2 versions of the one-shot version `poly1305_auth` will revert to the 386 compatible version if the number of bytes is below a certain threshold. 
99 | 100 | ## x86-64 ## 101 | 102 | * x86-64 compatible: [poly1305\_x86](app/extensions/poly1305/poly1305_x86-64.inc) 103 | * SSE2: [poly1305\_sse2](app/extensions/poly1305/poly1305_sse2-64.inc) 104 | * AVX: [poly1305\_avx](app/extensions/poly1305/poly1305_avx-64.inc) 105 | * AVX2: [poly1305\_avx2](app/extensions/poly1305/poly1305_avx2-64.inc) 106 | 107 | SSE2, AVX, and AVX2 versions of the one-shot version `poly1305_auth` will revert to the x86-64 compatible version if the number of bytes is below a certain threshold. 108 | 109 | The x86-64 compatible version is _only_ included for short messages. It is thoroughly beaten by SIMD versions above 64-128 bytes. 110 | 111 | ## ARM ## 112 | 113 | * ARMv6: [poly1305\_armv6](app/extensions/poly1305/poly1305_armv6-32.inc) 114 | * NEON: [poly1305\_neon](app/extensions/poly1305/poly1305_neon-32.inc) 115 | 116 | NEON versions of the one-shot version `poly1305_auth` will revert to the ARMv6 version if the number of bytes is below a certain threshold. 117 | 118 | 119 | 120 | # BUILDING # 121 | 122 | See [asm-opt#configuring](https://github.com/floodyberry/asm-opt#configuring) for full configure options. 123 | 124 | If you would like to use Yasm with a gcc-compatible compiler, pass `--yasm` to configure. 125 | 126 | The Visual Studio projects are generated assuming Yasm is available. You will need to have [Yasm.exe](http://yasm.tortall.net/Download.html) somewhere in your path to build them. 127 | 128 | ## STATIC LIBRARY ## 129 | 130 | ./configure 131 | make lib 132 | 133 | and `make install-lib` OR copy `bin/poly1305.lib` and `app/include/poly1305.h` to your desired location. 134 | 135 | ## SHARED LIBRARY ## 136 | 137 | ./configure --pic 138 | make shared 139 | make install-shared 140 | 141 | ## UTILITIES / TESTING ## 142 | 143 | ./configure 144 | make util 145 | bin/poly1305-util [bench|fuzz] 146 | 147 | ### BENCHMARK / TESTING ### 148 | 149 | Benchmarking will implicitly test every available version. 
If any fail, it will exit with an error indicating which versions did not pass. Features tested include: 150 | 151 | * One-shot and Incremental authentication 152 | * Results above 2^130 - 5 are properly normalized 153 | * All potential block sizes in the underlying implementation are triggered 154 | 155 | ### FUZZING ### 156 | 157 | Fuzzing tests every available implementation for the current CPU against the reference implementation. Features tested are: 158 | 159 | * One-shot and Incremental authentication 160 | 161 | # BENCHMARKS # 162 | 163 | Only the top 3 benchmarks per mode will be shown. Anything past 3 or so is pretty irrelevant to the current architecture. 164 | 165 | ## [E5200](http://ark.intel.com/products/37212/) ## 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 |
| Implementation | 1 byte | 64 bytes | 576 bytes | 8192 bytes |
|---|---|---|---|---|
| SSE2-64 | 158 | 4.70 | 2.22 | 1.53 |
| SSE2-32 | 275 | 7.42 | 2.54 | 1.80 |
| x86-64 | 158 | 4.74 | 3.44 | 3.30 |
| x86-32 | 275 | 7.08 | 3.74 | 3.33 |
176 | 177 | 178 | ## [i7-4770K](http://ark.intel.com/products/75123) ## 179 | 180 | Timings are with Turbo Boost and Hyperthreading, so their accuracy is not concrete. 181 | For reference, OpenSSL and Crypto++ give ~0.8cpb for AES-128-CTR and ~1.1cpb for AES-256-CTR, ~7.4cpb for SHA-512, and ~4.5cpb for MD5. 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 |
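| Implementation | 1 byte | 64 bytes | 576 bytes | 8192 bytes |
|---|---|---|---|---|
| AVX2-64 | 110 | 3.22 | 0.96 | 0.60 |
| AVX2-32 | 223 | 4.37 | 1.15 | 0.67 |
| AVX-64 | 110 | 3.22 | 1.39 | 1.06 |
| AVX-32 | 223 | 4.37 | 1.51 | 1.04 |
| SSE2-64 | 110 | 3.22 | 1.43 | 1.12 |
| SSE2-32 | 223 | 4.33 | 1.55 | 1.10 |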
194 | 195 | ## AMD FX-8120 ## 196 | 197 | Timings are with Turbo on, so accuracy is not concrete. I'm not sure how to adjust for it either, 198 | and depending on clock speed (3.1ghz vs 4.0ghz), OpenSSL gives between 0.73cpb - 0.94cpb for AES-128-CTR, 199 | 1.03cpb - 1.33cpb for AES-256-CTR, 10.96cpb - 14.1cpb for SHA-512, and 4.7cpb - 5.16cpb for MD5. 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 |
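| Implementation | 1 byte | 64 bytes | 576 bytes | 8192 bytes |
|---|---|---|---|---|
| AVX-64 | 175 | 5.27 | 1.35 | 0.80 |
| SSE2-64 | 175 | 5.36 | 1.47 | 0.88 |
| AVX-32 | 319 | 5.72 | 1.85 | 1.19 |
| SSE2-32 | 320 | 5.78 | 1.94 | 1.31 |
| x86-32 | 313 | 8.00 | 3.62 | 2.99 |
| x86-64 | 175 | 5.30 | 4.03 | 3.83 |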
212 | 213 | ## ZedBoard (Cortex-A9) ## 214 | 215 | I don't have access to the cycle counter yet, so cycles are computed by taking the microseconds times the clock speed (666mhz) divided by 1 million. For comparison, on long messages, OpenSSL 1.0.0e gives 52.3 cpb for aes-128-cbc (woof), ~123cpb for SHA-512 (really woof), and ~9.6cpb for MD5. 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 |
| Implementation | 1 byte | 64 bytes | 576 bytes | 8192 bytes |
|---|---|---|---|---|
| Neon-32 | 290 | 9.53 | 3.33 | 2.26 |
| ARMv6-32 | 290 | 9.53 | 6.99 | 6.73 |
224 | 225 | 226 | # LICENSE # 227 | 228 | Public Domain, or MIT -------------------------------------------------------------------------------- /app/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floodyberry/poly1305-opt/700d5cf167441f627d76c845f56b7ea72bdd91e8/app/.keep -------------------------------------------------------------------------------- /app/extensions/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floodyberry/poly1305-opt/700d5cf167441f627d76c845f56b7ea72bdd91e8/app/extensions/.keep -------------------------------------------------------------------------------- /app/extensions/poly1305/impl.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> /* fprintf */ 2 | #include <stdlib.h> /* exit */ 3 | #include <string.h> /* memcpy, memset, memcmp */ 4 | #include "cpuid.h" 5 | #include "poly1305.h" 6 | 7 | typedef struct poly1305_state_internal_t { 8 | unsigned char opaque[192]; /* largest state required (AVX2) */ 9 | size_t leftover, block_size; 10 | unsigned char buffer[64]; /* largest blocksize (AVX2) */ 11 | } poly1305_state_internal; 12 | 13 | typedef struct poly1305_impl_t { 14 | unsigned long cpu_flags; 15 | const char *desc; 16 | 17 | size_t (*block_size)(void); 18 | void (*init_ext)(void *state, const poly1305_key *key, size_t bytes_hint); 19 | void (*blocks)(void *state, const unsigned char *in, size_t inlen); 20 | void (*finish_ext)(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); 21 | void (*auth)(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key); 22 | } poly1305_impl_t; 23 | 24 | #define POLY1305_DECLARE(ext) \ 25 | size_t poly1305_block_size_##ext(void); \ 26 | void poly1305_init_ext_##ext(void *state, const poly1305_key *key, size_t bytes_hint); \ 27 | void poly1305_blocks_##ext(void *state, const unsigned char *in, size_t inlen); \ 28 | void 
poly1305_finish_ext_##ext(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); \ 29 | void poly1305_auth_##ext(unsigned char *mac, const unsigned char *m, size_t inlen, const poly1305_key *key); 30 | 31 | #define POLY1305_IMPL(cpuflags, desc, ext) \ 32 | {(cpuflags), desc, poly1305_block_size_##ext, poly1305_init_ext_##ext, poly1305_blocks_##ext, poly1305_finish_ext_##ext, poly1305_auth_##ext} 33 | 34 | #if defined(ARCH_X86) 35 | /* 32 bit only implementations */ 36 | #if defined(CPU_32BITS) 37 | #endif 38 | 39 | /* 64 bit only implementations */ 40 | #if defined(CPU_64BITS) 41 | #endif 42 | 43 | /* both 32 and 64 bits */ 44 | POLY1305_DECLARE(x86) 45 | #define POLY1305_X86 POLY1305_IMPL(CPUID_X86, "x86", x86) 46 | 47 | #if defined(HAVE_SSE2) 48 | POLY1305_DECLARE(sse2) 49 | #define POLY1305_SSE2 POLY1305_IMPL(CPUID_SSE2, "sse2", sse2) 50 | #endif 51 | 52 | #if defined(HAVE_AVX) 53 | POLY1305_DECLARE(avx) 54 | #define POLY1305_AVX POLY1305_IMPL(CPUID_AVX, "avx", avx) 55 | #endif 56 | 57 | #if defined(HAVE_AVX2) 58 | POLY1305_DECLARE(avx2) 59 | #define POLY1305_AVX2 POLY1305_IMPL(CPUID_AVX2, "avx2", avx2) 60 | #endif 61 | #endif 62 | 63 | #if defined(ARCH_ARM) 64 | #if defined(HAVE_ARMv6) 65 | POLY1305_DECLARE(armv6) 66 | #define POLY1305_ARMv6 POLY1305_IMPL(CPUID_ARMv6, "armv6", armv6) 67 | #endif 68 | 69 | #if defined(HAVE_NEON) 70 | POLY1305_DECLARE(neon) 71 | #define POLY1305_NEON POLY1305_IMPL(CPUID_NEON, "neon", neon) 72 | #endif 73 | #endif 74 | 75 | /* the "always runs" version */ 76 | #if defined(HAVE_INT64) && defined(HAVE_INT128) 77 | #define POLY1305_GENERIC POLY1305_IMPL(CPUID_GENERIC, "generic/64", ref) 78 | #include "poly1305/poly1305_ref-64.inc" 79 | #elif defined(HAVE_INT32) && defined(HAVE_INT64) 80 | #define POLY1305_GENERIC POLY1305_IMPL(CPUID_GENERIC, "generic/32", ref) 81 | #include "poly1305/poly1305_ref-32.inc" 82 | #else 83 | #define POLY1305_GENERIC POLY1305_IMPL(CPUID_GENERIC, "generic/8", ref) 84 | #include 
"poly1305/poly1305_ref-8.inc" 85 | #endif 86 | 87 | /* list implemenations from most optimized to least, with generic as the last entry */ 88 | static const poly1305_impl_t poly1305_list[] = { 89 | /* x86 */ 90 | #if defined(POLY1305_AVX2) 91 | POLY1305_AVX2, 92 | #endif 93 | #if defined(POLY1305_AVX) 94 | POLY1305_AVX, 95 | #endif 96 | #if defined(POLY1305_SSE2) 97 | POLY1305_SSE2, 98 | #endif 99 | #if defined(POLY1305_X86) 100 | POLY1305_X86, 101 | #endif 102 | 103 | /* arm */ 104 | #if defined(POLY1305_NEON) 105 | POLY1305_NEON, 106 | #endif 107 | #if defined(POLY1305_ARMv6) 108 | POLY1305_ARMv6, 109 | #endif 110 | 111 | POLY1305_GENERIC 112 | }; 113 | 114 | POLY1305_DECLARE(bootup) 115 | 116 | static const poly1305_impl_t poly1305_bootup_impl = POLY1305_IMPL(CPUID_GENERIC, "bootup", bootup); 117 | static const poly1305_impl_t *poly1305_opt = &poly1305_bootup_impl; 118 | 119 | /* is the pointer aligned on a word boundary? */ 120 | static int 121 | poly1305_is_aligned(const void *p) { 122 | return ((size_t)p & (sizeof(size_t) - 1)) == 0; 123 | } 124 | 125 | /* processes inlen bytes (full blocks only), handling input alignment */ 126 | static void 127 | poly1305_consume(poly1305_state_internal *state, const unsigned char *in, size_t inlen) { 128 | int in_aligned; 129 | 130 | /* it's ok to call with 0 bytes */ 131 | if (!inlen) 132 | return; 133 | 134 | /* if everything is aligned, handle directly */ 135 | in_aligned = poly1305_is_aligned(in); 136 | if (in_aligned) { 137 | poly1305_opt->blocks(state->opaque, in, inlen); 138 | return; 139 | } 140 | 141 | /* copy the unaligned data to an aligned buffer and process in chunks */ 142 | while (inlen) { 143 | unsigned char buffer[1024]; 144 | const size_t bytes = (inlen > sizeof(buffer)) ? 
sizeof(buffer) : inlen; 145 | memcpy(buffer, in, bytes); 146 | poly1305_opt->blocks(state->opaque, buffer, bytes); 147 | in += bytes; 148 | inlen -= bytes; 149 | } 150 | } 151 | 152 | /* start an incremental MAC on S under key, passing a bytes_hint of 0 to the active implementation */ 153 | LIB_PUBLIC void 154 | poly1305_init(poly1305_state *S, const poly1305_key *key) { 155 | poly1305_state_internal *state = (poly1305_state_internal *)S; 156 | poly1305_opt->init_ext(state->opaque, key, 0); 157 | state->leftover = 0; 158 | state->block_size = poly1305_opt->block_size(); 159 | } 160 | /* same as poly1305_init, but forwards the caller's bytes_hint to the active implementation */ 161 | LIB_PUBLIC void 162 | poly1305_init_ext(poly1305_state *S, const poly1305_key *key, size_t bytes_hint) { 163 | poly1305_state_internal *state = (poly1305_state_internal *)S; 164 | poly1305_opt->init_ext(state->opaque, key, bytes_hint); 165 | state->leftover = 0; 166 | state->block_size = poly1305_opt->block_size(); 167 | } 168 | /* absorb inlen bytes from in: top up and flush a pending partial block, process whole blocks, then buffer the tail */ 169 | LIB_PUBLIC void 170 | poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen) { 171 | poly1305_state_internal *state = (poly1305_state_internal *)S; 172 | 173 | /* handle leftover: fill state->buffer first; return early if it is still short of one block */ 174 | if (state->leftover) { 175 | size_t want = (state->block_size - state->leftover); 176 | if (want > inlen) 177 | want = inlen; 178 | memcpy(state->buffer + state->leftover, in, want); 179 | inlen -= want; 180 | in += want; 181 | state->leftover += want; 182 | if (state->leftover < state->block_size) 183 | return; 184 | poly1305_opt->blocks(state->opaque, state->buffer, state->block_size); 185 | state->leftover = 0; 186 | } 187 | 188 | /* process full blocks; the mask rounds inlen down to a multiple of block_size (assumes block_size is a power of two - TODO confirm) */ 189 | if (inlen >= state->block_size) { 190 | size_t want = (inlen & ~(state->block_size - 1)); 191 | poly1305_consume(state, in, want); 192 | in += want; 193 | inlen -= want; 194 | } 195 | 196 | /* store leftover (fewer than block_size bytes) for the next update or finish */ 197 | if (inlen) { 198 | memcpy(state->buffer + state->leftover, in, inlen); 199 | state->leftover += inlen; 200 | } 201 | } 202 | /* hand the buffered tail to the active implementation's finish_ext, which writes the result to mac */ 203 | LIB_PUBLIC void 204 | poly1305_finish(poly1305_state *S, unsigned char *mac) { 205 | poly1305_state_internal *state = (poly1305_state_internal 
*)S; 206 | poly1305_opt->finish_ext(state->opaque, state->buffer, state->leftover, mac); 207 | } 208 | 209 | LIB_PUBLIC void 210 | poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key) { 211 | poly1305_opt->auth(mac, in, inlen, key); 212 | } 213 | 214 | /* does an incremental mac as well as a one pass and verifies they all match */ 215 | static int 216 | poly1305_auth_test(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key) { 217 | poly1305_state st; 218 | unsigned char mac2[16]; 219 | size_t block_size = poly1305_opt->block_size(); 220 | 221 | /* one pass */ 222 | poly1305_auth(mac, in, inlen, key); 223 | 224 | /* incremental one pass */ 225 | poly1305_init_ext(&st, key, inlen); 226 | poly1305_update(&st, in, inlen); 227 | poly1305_finish(&st, mac2); 228 | 229 | /* make sure they match */ 230 | if (memcmp(mac, mac2, 16) != 0) { 231 | memset(mac, 0, 16); 232 | return 1; 233 | } 234 | 235 | /* incremental multi-pass. 
SSE2/AVX/AVX2 can support up to a 64 byte block size, so try all possible block sizes (64, 32, 16) */ 236 | poly1305_init(&st, key); 237 | 238 | /* do the native block size first to prime the state */ 239 | if (inlen >= block_size) { poly1305_update(&st, in, block_size); in += block_size; inlen -= block_size; } 240 | 241 | /* try 64 down to 16 */ 242 | if (inlen >= 64) { poly1305_update(&st, in, 64); in += 64; inlen -= 64; } 243 | if (inlen >= 32) { poly1305_update(&st, in, 32); in += 32; inlen -= 32; } 244 | if (inlen >= 16) { poly1305_update(&st, in, 16); in += 16; inlen -= 16; } 245 | if (inlen > 0) { poly1305_update(&st, in, inlen); } 246 | poly1305_finish(&st, mac2); 247 | 248 | /* make sure they match */ 249 | if (memcmp(mac, mac2, 16) != 0) { 250 | memset(mac, 0, 16); 251 | return 1; 252 | } 253 | 254 | return 0; 255 | } 256 | 257 | static int 258 | poly1305_test_impl(const void *impl) { 259 | /* example from nacl */ 260 | static const poly1305_key nacl_key = {{ 261 | 0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91, 262 | 0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25, 263 | 0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65, 264 | 0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80, 265 | }}; 266 | 267 | static const unsigned char nacl_msg[131] = { 268 | 0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73, 269 | 0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce, 270 | 0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4, 271 | 0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a, 272 | 0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b, 273 | 0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72, 274 | 0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2, 275 | 0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38, 276 | 0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a, 277 | 0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae, 278 | 0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea, 279 | 0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda, 280 | 0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde, 281 | 0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3, 282 | 0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6, 283 | 0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74, 284 | 
0xe3,0x55,0xa5 285 | }; 286 | 287 | static const unsigned char nacl_mac[16] = { 288 | 0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5, 289 | 0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9 290 | }; 291 | 292 | /* generates a final value of (2^130 - 2) == 3 */ 293 | static const poly1305_key wrap_key = {{ 294 | 0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 295 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 296 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 297 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 298 | }}; 299 | 300 | static const unsigned char wrap_msg[16] = { 301 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, 302 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 303 | }; 304 | 305 | static const unsigned char wrap_mac[16] = { 306 | 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 307 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 308 | }; 309 | 310 | /* 311 | auth the auths of [msg:,key:0,0..,pad:ff,ff...], [msg:1,key:1,1..,pad:ff,ff...], 312 | [msg:2,2,key:2,2..,pad:ff,ff...] with the following key 313 | */ 314 | static const poly1305_key total_key = {{ 315 | 0x01,0x02,0x03,0x04,0x05,0x06,0x07, 316 | 0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9, 317 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff, 318 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff 319 | }}; 320 | 321 | static const unsigned char total_mac[16] = { 322 | 0xc6,0x9d,0xc3,0xb9,0x75,0xee,0x5f,0x6b, 323 | 0x28,0x99,0x57,0x94,0x41,0x27,0xd7,0x5e, 324 | }; 325 | 326 | poly1305_state state_total; 327 | poly1305_key key; 328 | unsigned char msg[256]; 329 | unsigned char mac[16]; 330 | size_t i, j; 331 | int result = 0; 332 | 333 | poly1305_opt = (poly1305_impl_t *)impl; 334 | 335 | result |= poly1305_auth_test(mac, nacl_msg, sizeof(nacl_msg), &nacl_key); 336 | result |= memcmp(nacl_mac, mac, sizeof(nacl_mac)); 337 | 338 | result |= poly1305_auth_test(mac, wrap_msg, sizeof(wrap_msg), &wrap_key); 339 | result |= memcmp(wrap_mac, mac, sizeof(wrap_mac)); 340 | 341 | poly1305_init(&state_total, &total_key); 342 | for (i = 0; i < 256; i++) { 343 | /* set key and message to 'i,i,i..', pad 
to 'ff,ff,ff..' */ 344 | for (j = 0; j < 16; j++) key.b[j] = i; 345 | for (j = 0; j < 16; j++) key.b[j+16] = 0xff; 346 | for (j = 0; j < i; j++) msg[j] = i; 347 | result |= poly1305_auth_test(mac, msg, i, &key); 348 | poly1305_update(&state_total, mac, 16); 349 | } 350 | poly1305_finish(&state_total, mac); 351 | result |= memcmp(total_mac, mac, sizeof(total_mac)); 352 | 353 | return result; 354 | } 355 | 356 | LIB_PUBLIC int 357 | poly1305_startup(void) { 358 | const void *opt = LOCAL_PREFIX(cpu_select)(poly1305_list, sizeof(poly1305_impl_t), poly1305_test_impl); 359 | if (opt) { 360 | poly1305_opt = (const poly1305_impl_t *)opt; 361 | return 0; 362 | } else { 363 | return 1; 364 | } 365 | } 366 | 367 | size_t 368 | poly1305_block_size_bootup(void) { 369 | size_t ret = 0; 370 | if (poly1305_startup() == 0) { 371 | ret = poly1305_opt->block_size(); 372 | } else { 373 | fprintf(stderr, "poly1305 failed to startup\n"); 374 | exit(1); 375 | } 376 | return ret; 377 | } 378 | 379 | void 380 | poly1305_init_ext_bootup(void *state, const poly1305_key *key, size_t bytes_hint) { 381 | if (poly1305_startup() == 0) { 382 | poly1305_opt->init_ext(state, key, bytes_hint); 383 | } else { 384 | fprintf(stderr, "poly1305 failed to startup\n"); 385 | exit(1); 386 | } 387 | } 388 | 389 | void 390 | poly1305_blocks_bootup(void *state, const unsigned char *in, size_t inlen) { 391 | if (poly1305_startup() == 0) { 392 | poly1305_opt->blocks(state, in, inlen); 393 | } else { 394 | fprintf(stderr, "poly1305 failed to startup\n"); 395 | exit(1); 396 | } 397 | } 398 | 399 | void 400 | poly1305_finish_ext_bootup(void *state, const unsigned char *in, size_t remaining, unsigned char *mac) { 401 | if (poly1305_startup() == 0) { 402 | poly1305_opt->finish_ext(state, in, remaining, mac); 403 | } else { 404 | fprintf(stderr, "poly1305 failed to startup\n"); 405 | exit(1); 406 | } 407 | } 408 | 409 | void 410 | poly1305_auth_bootup(unsigned char *mac, const unsigned char *in, size_t inlen, const 
poly1305_key *key) { 411 | if (poly1305_startup() == 0) { 412 | poly1305_opt->auth(mac, in, inlen, key); 413 | } else { 414 | fprintf(stderr, "poly1305 failed to startup\n"); 415 | exit(1); 416 | } 417 | } 418 | 419 | #if defined(UTILITIES) 420 | 421 | #include 422 | #include 423 | #include "fuzz.h" 424 | #include "bench.h" 425 | 426 | static const fuzz_variable_t fuzz_inputs[] = { 427 | {"key", FUZZ_ARRAY, 32}, 428 | {"input", FUZZ_RANDOM_LENGTH_ARRAY0, 256}, 429 | {0, FUZZ_DONE, 0} 430 | }; 431 | 432 | static const fuzz_variable_t fuzz_outputs[] = { 433 | {"auth", FUZZ_ARRAY, 16}, 434 | {0, FUZZ_DONE, 0} 435 | }; 436 | 437 | 438 | /* process the input with the given implementation and write it to the output */ 439 | static void 440 | poly1305_fuzz_impl(const void *impl, const unsigned char *in, const size_t *random_sizes, unsigned char *out) { 441 | const poly1305_key *k = (const poly1305_key *)in; 442 | const unsigned char *m = in + 32; 443 | size_t bytes = random_sizes[0]; 444 | poly1305_opt = (const poly1305_impl_t *)impl; 445 | poly1305_auth_test(out, m, bytes, k); 446 | } 447 | 448 | /* run the fuzzer on poly1305 */ 449 | void 450 | poly1305_fuzz(void) { 451 | fuzz_init(); 452 | fuzz(poly1305_list, sizeof(poly1305_impl_t), fuzz_inputs, fuzz_outputs, poly1305_fuzz_impl); 453 | } 454 | 455 | 456 | 457 | static unsigned char *bench_arr = NULL; 458 | static unsigned char bench_mac[16]; 459 | static poly1305_key bench_key = {{0}}; 460 | static size_t bench_len = 0; 461 | 462 | static void 463 | poly1305_bench_impl(const void *impl) { 464 | poly1305_opt = (const poly1305_impl_t *)impl; 465 | poly1305_auth(bench_mac, bench_arr, bench_len, &bench_key); 466 | } 467 | 468 | void 469 | poly1305_bench(void) { 470 | static const size_t lengths[] = {1, 64, 128, 576, 8192, 0}; 471 | size_t i; 472 | bench_arr = bench_get_buffer(); 473 | memset(bench_arr, 0xff, 8192); 474 | memset(&bench_key, 0xff, sizeof(bench_key)); 475 | for (i = 0; lengths[i]; i++) { 476 | bench_len = 
/*
 * Poly1305 for 32-bit ARM, using five 26-bit limbs.
 *
 * State layout as used by the code below (byte offsets from the state
 * pointer):
 *    0..19  r, five masked 26-bit limbs
 *   20..39  h, five accumulator limbs
 *   40..55  key bytes 16..31 (added to h at the end)
 *   56      flag word: zero means full blocks get the 2^128 bit, non-zero
 *           means the block already carries its own padding byte
 */
SECTION_TEXT

.arch armv5

/* size_t block_size(void): returns 16 */
GLOBAL_HIDDEN_FN poly1305_block_size_armv6
	mov r0, #16
	bx lr
FN_END poly1305_block_size_armv6

/* masks ANDed onto the five limbs of r in poly1305_init_ext_armv6 */
.p2align 2
poly1305_init_constants_armv6:
	.long 0x3ffffff
	.long 0x3ffff03
	.long 0x3ffc0ff
	.long 0x3f03fff
	.long 0x00fffff

/*
 * init_ext: r0 = state, r1 = key.
 * Splits key bytes 0..15 into masked 26-bit limbs, zeroes h, copies key
 * bytes 16..31 and clears the flag word. r2 is not read.
 */
GLOBAL_HIDDEN_FN poly1305_init_ext_armv6
poly1305_init_ext_armv6_local:
	stmfd sp!, {r4-r11, lr}
	ldmia r1!, {r2-r5}
	ldr r7, =poly1305_init_constants_armv6
	/* r8..r12 = the 128-bit value cut at bit 0, 26, 52, 78 and 104 */
	mov r8, r2
	mov r9, r2, lsr #26
	mov r10, r3, lsr #20
	mov r11, r4, lsr #14
	mov r12, r5, lsr #8
	orr r9, r9, r3, lsl #6
	orr r10, r10, r4, lsl #12
	orr r11, r11, r5, lsl #18
	/* apply the masks and store r at state+0 */
	ldmia r7, {r2-r6}
	and r2, r2, r8
	and r3, r3, r9
	and r4, r4, r10
	and r5, r5, r11
	and r6, r6, r12
	stmia r0!, {r2-r6}
	/* h = 0 at state+20 */
	eor r2, r2, r2
	eor r3, r3, r3
	eor r4, r4, r4
	eor r5, r5, r5
	eor r6, r6, r6
	stmia r0!, {r2-r6}
	/* key bytes 16..31 to state+40, r6 is still zero and clears state+56 */
	ldmia r1!, {r2-r5}
	stmia r0, {r2-r6}
	ldmfd sp!, {r4-r11, lr}
	bx lr
FN_END poly1305_init_ext_armv6

.ltorg

/*
 * blocks: r0 = state, r1 = input, r2 = byte count.
 * Absorbs 16 bytes per iteration while at least 16 bytes remain.
 *
 * Stack frame (128 bytes):
 *   sp+0..31   spilled 64-bit column sums
 *   sp+32      bit added to the top limb of each block (1 << 24 or 0)
 *   sp+36      state pointer
 *   sp+40      current input pointer
 *   sp+44      bytes remaining
 *   sp+64..83  copy of the five limbs of r
 */
GLOBAL_HIDDEN_FN poly1305_blocks_armv6
poly1305_blocks_armv6_local:
	stmfd sp!, {r4-r11, lr}
	sub sp, sp, #128
	str r0, [sp, #36]
	str r1, [sp, #40]
	str r2, [sp, #44]
	mov r14, r1
	mov r12, r2
	/* choose the high bit from the flag word at state+56 */
	ldr r8, [r0, #56]
	tst r8, r8
	eor r6, r6, r6
	moveq r6, #(1 << 24)
	str r6, [sp, #32]
	add r10, sp, #64
	/* r0-r4 = r, r5-r9 = h */
	ldmia r0, {r0-r9}
	stmia r10, {r0-r4}
	cmp r12, #16
	blo poly1305_blocks_armv6_done
poly1305_blocks_armv6_mainloop:
	/* load 16 message bytes and cut them into 26-bit limbs */
	ldmia r14!, {r0-r3}
	mov r10, r0, lsr #26
	mov r11, r1, lsr #20
	mov r12, r2, lsr #14
	str r14, [sp, #40]
	mov r4, r3, lsr #8
	orr r10, r10, r1, lsl #6
	orr r11, r11, r2, lsl #12
	orr r12, r12, r3, lsl #18
	and r0, r0, #0x3ffffff
	and r10, r10, #0x3ffffff
	ldr r3, [sp, #32]
	and r11, r11, #0x3ffffff
	and r12, r12, #0x3ffffff
	/* h += message limbs, the top limb also gets the high bit */
	add r5, r5, r0
	add r6, r6, r10
	orr r4, r4, r3
	add r7, r7, r11
	add r14, sp, #64
	add r8, r8, r12
	add r9, r9, r4
	/* h *= r: r0-r4 are reloaded from the stack copy, limbs are replaced */
	/* by five times their value as soon as only wrapped terms remain */
	ldmia r14, {r0-r4}
	umull r10,r11,r5,r4
	umull r12,r14,r5,r3
	umlal r10,r11,r6,r3
	umlal r12,r14,r6,r2
	umlal r10,r11,r7,r2
	umlal r12,r14,r7,r1
	add r4,r4,r4,lsl #2
	add r3,r3,r3,lsl #2
	umlal r10,r11,r8,r1
	umlal r12,r14,r8,r0
	umlal r10,r11,r9,r0
	umlal r12,r14,r9,r4
	/* columns 4 and 3 */
	str r10, [sp, #24]
	str r11, [sp, #28]
	str r12, [sp, #16]
	str r14, [sp, #20]
	umull r10,r11,r5,r2
	umull r12,r14,r5,r1
	umlal r10,r11,r6,r1
	umlal r12,r14,r6,r0
	umlal r10,r11,r7,r0
	umlal r12,r14,r7,r4
	add r2,r2,r2,lsl #2
	add r1,r1,r1,lsl #2
	umlal r10,r11,r8,r4
	umlal r12,r14,r8,r3
	umlal r10,r11,r9,r3
	umlal r12,r14,r9,r2
	/* columns 2 and 1 */
	str r10, [sp, #8]
	str r11, [sp, #12]
	str r12, [sp, #0]
	str r14, [sp, #4]
	/* column 0 stays in r10:r11 */
	umull r10,r11,r5,r0
	umlal r10,r11,r6,r4
	umlal r10,r11,r7,r3
	umlal r10,r11,r8,r2
	umlal r10,r11,r9,r1
	/* reload columns 1..4 and propagate carries in 26-bit steps, the carry */
	/* out of column 4 is multiplied by five and added to column 0 */
	ldmia sp, {r0-r7}
	lsr r12, r10, #26
	lsr r14, r4, #26
	orr r12, r12, r11, lsl #6
	orr r14, r14, r5, lsl #6
	and r10, r10, #0x3ffffff
	and r4, r4, #0x3ffffff
	adds r0, r0, r12
	adc r1, r1, #0
	adds r6, r6, r14
	adc r7, r7, #0
	lsr r12, r0, #26
	lsr r14, r6, #26
	orr r12, r12, r1, lsl #6
	orr r14, r14, r7, lsl #6
	and r0, r0, #0x3ffffff
	and r6, r6, #0x3ffffff
	add r14, r14, r14, lsl #2
	adds r2, r2, r12
	adc r3, r3, #0
	add r10, r10, r14
	lsr r12, r2, #26
	lsr r14, r10, #26
	orr r12, r12, r3, lsl #6
	and r5, r10, #0x3ffffff
	and r7, r2, #0x3ffffff
	add r4, r4, r12
	add r0, r0, r14
	lsr r12, r4, #26
	and r8, r4, #0x3ffffff
	add r9, r6, r12
	ldr r12, [sp, #44]
	ldr r14, [sp, #40]
	mov r6, r0
	/* loop while at least 16 bytes remain after this block, the flags */
	/* come from cmp because the following sub does not set them */
	cmp r12, #32
	sub r12, r12, #16
	str r12, [sp, #44]
	bhs poly1305_blocks_armv6_mainloop
poly1305_blocks_armv6_done:
	/* write h back to state+20..39 */
	ldr r12, [sp, #36]
	str r5, [r12, #20]
	str r6, [r12, #24]
	str r7, [r12, #28]
	str r8, [r12, #32]
	str r9, [r12, #36]
	add sp, sp, #128
	ldmfd sp!, {r4-r11, lr}
	bx lr
FN_END poly1305_blocks_armv6

/*
 * finish_ext: r0 = state, r1 = leftover input, r2 = leftover byte count,
 * r3 = output MAC (16 bytes).
 * Absorbs the leftover bytes, reduces h, adds key bytes 16..31, writes the
 * MAC and zeroes the first 64 bytes of the state.
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_armv6
poly1305_finish_ext_armv6_local:
	stmfd sp!, {r4-r11, lr}
	sub sp, sp, #16
	mov r5, r0
	mov r6, r1
	mov r7, r2
	mov r8, r3
	ands r2, r2, r2
	beq poly1305_finish_ext_armv6_noremaining
	/* zero a 16-byte stack buffer and copy the leftover bytes into it, */
	/* 8, 4, 2 and 1 bytes at a time according to the bits of the count */
	eor r0, r0
	mov r9, sp
	str r0, [sp, #0]
	str r0, [sp, #4]
	str r0, [sp, #8]
	str r0, [sp, #12]
	tst r2, #8
	beq poly1305_finish_ext_armv6_skip8
	ldmia r1!, {r10-r11}
	stmia r9!, {r10-r11}
poly1305_finish_ext_armv6_skip8:
	tst r2, #4
	beq poly1305_finish_ext_armv6_skip4
	ldr r10, [r1], #4
	str r10, [r9], #4
poly1305_finish_ext_armv6_skip4:
	tst r2, #2
	beq poly1305_finish_ext_armv6_skip2
	ldrh r10, [r1], #2
	strh r10, [r9], #2
poly1305_finish_ext_armv6_skip2:
	tst r2, #1
	beq poly1305_finish_ext_armv6_skip1
	ldrb r10, [r1], #1
	strb r10, [r9], #1
poly1305_finish_ext_armv6_skip1:
	/* append the padding byte 1 and set the flag word so that blocks */
	/* does not add the 2^128 bit again */
	mov r11, #1
	strb r11, [r9]
	str r11, [r5, #56]
	mov r0, r5
	mov r1, sp
	mov r2, #16
	bl poly1305_blocks_armv6_local
poly1305_finish_ext_armv6_noremaining:
	/* full carry propagation of h, top carry wraps times five */
	ldr r0, [r5, #20]
	ldr r1, [r5, #24]
	ldr r2, [r5, #28]
	ldr r3, [r5, #32]
	ldr r4, [r5, #36]
	mov r12, r4, lsr #26
	and r4, r4, #0x3ffffff
	add r12, r12, r12, lsl #2
	add r0, r0, r12
	mov r12, r0, lsr #26
	and r0, r0, #0x3ffffff
	add r1, r1, r12
	mov r12, r1, lsr #26
	and r1, r1, #0x3ffffff
	add r2, r2, r12
	mov r12, r2, lsr #26
	and r2, r2, #0x3ffffff
	add r3, r3, r12
	mov r12, r3, lsr #26
	and r3, r3, #0x3ffffff
	add r4, r4, r12
	/* g = h + 5 - 2^130 in r6, r7, r10, r11, r14 */
	add r6, r0, #5
	mov r12, r6, lsr #26
	and r6, r6, #0x3ffffff
	add r7, r1, r12
	mov r12, r7, lsr #26
	and r7, r7, #0x3ffffff
	add r10, r2, r12
	mov r12, r10, lsr #26
	and r10, r10, #0x3ffffff
	add r11, r3, r12
	mov r12, #-(1 << 26)
	add r12, r12, r11, lsr #26
	and r11, r11, #0x3ffffff
	add r14, r4, r12
	/* r12 = all ones when g did not go negative, else zero, then */
	/* select g or h without branching */
	mov r12, r14, lsr #31
	sub r12, #1
	and r6, r6, r12
	and r7, r7, r12
	and r10, r10, r12
	and r11, r11, r12
	and r14, r14, r12
	mvn r12, r12
	and r0, r0, r12
	and r1, r1, r12
	and r2, r2, r12
	and r3, r3, r12
	and r4, r4, r12
	orr r0, r0, r6
	orr r1, r1, r7
	orr r2, r2, r10
	orr r3, r3, r11
	orr r4, r4, r14
	/* repack the five limbs into four 32-bit words */
	orr r0, r0, r1, lsl #26
	lsr r1, r1, #6
	orr r1, r1, r2, lsl #20
	lsr r2, r2, #12
	orr r2, r2, r3, lsl #14
	lsr r3, r3, #18
	orr r3, r3, r4, lsl #8
	/* add the 128-bit value stored at state+40 and write the MAC */
	ldr r6, [r5, #40]
	ldr r7, [r5, #44]
	ldr r10, [r5, #48]
	ldr r11, [r5, #52]
	adds r0, r0, r6
	adcs r1, r1, r7
	adcs r2, r2, r10
	adcs r3, r3, r11
	stmia r8, {r0-r3}
	/* wipe 64 bytes of state */
	mov r12, r5
	eor r0, r0, r0
	eor r1, r1, r1
	eor r2, r2, r2
	eor r3, r3, r3
	eor r4, r4, r4
	eor r5, r5, r5
	eor r6, r6, r6
	eor r7, r7, r7
	stmia r12!, {r0-r7}
	stmia r12, {r0-r7}
	add sp, sp, #16
	ldmfd sp!, {r4-r11, lr}
	bx lr
FN_END poly1305_finish_ext_armv6

/*
 * auth: r0 = output MAC, r1 = input, r2 = input length, r3 = key.
 * One-shot MAC using a 64-byte state placed on a 64-byte aligned stack slot.
 */
GLOBAL_HIDDEN_FN poly1305_auth_armv6
poly1305_auth_armv6_local:
	stmfd sp!, {r4-r8, lr}
	/* keep the original sp in r8, then align down and reserve the state */
	mov r8, sp
	and sp, sp, #(~63)
	sub sp, sp, #64
	mov r4, r0
	mov r5, r1
	mov r6, r2
	mov r7, r3
	mov r0, sp
	mov r1, r7
	bl poly1305_init_ext_armv6_local
	/* r2 = length rounded down to a multiple of 16 */
	ands r2, r6, #(~15)
	beq poly1305_auth_armv6_noblocks
	mov r0, sp
	mov r1, r5
	add r5, r5, r2
	sub r6, r6, r2
	bl poly1305_blocks_armv6_local
poly1305_auth_armv6_noblocks:
	/* hand the remaining 0..15 bytes to finish_ext */
	mov r0, sp
	mov r1, r5
	mov r2, r6
	mov r3, r4
	bl poly1305_finish_ext_armv6_local
	mov sp, r8
	ldmfd sp!, {r4-r8, lr}
	bx lr
FN_END poly1305_auth_armv6
/*
 * Poly1305 for x86-64 with AVX, two blocks per vector (one per 64-bit lane).
 *
 * State layout as used by the code below (byte offsets from the state
 * pointer):
 *     0..39   h, either five two-lane 26-bit limbs (while streaming) or
 *             three 64-bit limbs of 44, 44 and 42 bits (after the final
 *             reduction)
 *    40..59   r as five 26-bit limbs
 *    60..79   r squared, five 26-bit limbs (only when the hint is above 16)
 *    80..99   the square of that value (only when the hint is above 95)
 *   104..119  key bytes 16..31
 *   120       flag word, see poly1305_blocks_avx
 */
SECTION_TEXT

/* size_t block_size(void): returns 32 */
GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx,0,0
	movl $32, %eax
	ret
FN_END poly1305_block_size_avx

/*
 * init_ext: rdi = state, rsi = key, rdx = bytes hint (0 is treated as
 * unlimited). Clamps r, stores it as 26-bit limbs, precomputes up to two
 * successive squares depending on the hint, stores key bytes 16..31 and
 * clears the flag word.
 */
GLOBAL_HIDDEN_FN_EXT poly1305_init_ext_avx,4,1
poly1305_init_ext_avx_local:
	pushq %r15
	pushq %r14
	pushq %r13
	pushq %r12
	pushq %rbp
	pushq %rbx
	movq %rdi, %rbp
	/* hint == 0 becomes all ones, kept below the stack pointer */
	testq %rdx, %rdx
	movq $-1, %rax
	cmovne %rdx, %rax
	movq %rax, -16(%rsp)
	/* zero state bytes 0..47 */
	vpxor %xmm0, %xmm0, %xmm0
	vmovdqu %xmm0, (%rdi)
	vmovdqu %xmm0, 16(%rdi)
	vmovdqu %xmm0, 32(%rdi)
	/* r10, r9, r8 = key bytes 0..15 as masked 44, 44 and 42 bit limbs */
	movq (%rsi), %r9
	movq 8(%rsi), %r8
	movabsq $17575274610687, %r10
	andq %r9, %r10
	shrq $44, %r9
	movq %r8, %rax
	salq $20, %rax
	orq %rax, %r9
	movabsq $17592181915647, %rax
	andq %rax, %r9
	shrq $24, %r8
	movabsq $68719475727, %rax
	andq %rax, %r8
	/* store r as five 26-bit limbs at state+40 */
	leaq 40(%rdi), %r15
	movl %r10d, %eax
	andl $67108863, %eax
	movl %eax, 40(%rdi)
	movl %r9d, %edx
	sall $18, %edx
	movq %r10, %rax
	shrq $26, %rax
	orl %edx, %eax
	andl $67108863, %eax
	movl %eax, 44(%rdi)
	movq %r9, %rax
	shrq $8, %rax
	andl $67108863, %eax
	movl %eax, 48(%rdi)
	movq %r9, %rax
	shrq $34, %rax
	movl %r8d, %edx
	sall $10, %edx
	orl %edx, %eax
	andl $67108863, %eax
	movl %eax, 52(%rdi)
	movq %r8, %rax
	shrq $16, %rax
	movl %eax, 56(%rdi)
	/* key bytes 16..31 to state+104 */
	movq 16(%rsi), %rax
	movq %rax, 104(%rdi)
	movq 24(%rsi), %rax
	movq %rax, 112(%rdi)
	/* rbx counts the squaring passes: pass 0 writes to state+60 when the */
	/* hint is above 16, pass 1 writes to state+80 when it is above 95 */
	movl $0, %ebx
.L7:
	testq %rbx, %rbx
	jne .L4
	leaq 60(%rbp), %r15
	cmpq $16, -16(%rsp)
	ja .L6
	jmp .L5
.L4:
	cmpq $1, %rbx
	jne .L6
	leaq 80(%rbp), %r15
	cmpq $95, -16(%rsp)
	jbe .L5
.L6:
	/* square the value in r10, r9, r8 using 64x64 to 128 bit multiplies, */
	/* wrapped terms use the top limb times 20 */
	leaq (%r8,%r8,4), %rsi
	salq $2, %rsi
	leaq (%r9,%r9), %rdi
	movq %rdi, %rax
	mulq %rsi
	movq %rax, %r13
	movq %rdx, %r14
	movq %r10, %rax
	mulq %r10
	addq %r13, %rax
	adcq %r14, %rdx
	movabsq $17592186044415, %rcx
	movq %rax, -72(%rsp)
	movq %rdx, -64(%rsp)
	andq -72(%rsp), %rcx
	leaq (%r10,%r10), %r11
	movq %r11, %rax
	mulq %r9
	movq %rax, %r11
	movq %rdx, %r12
	movq %rsi, %rax
	mulq %r8
	movq %rax, %r13
	movq %rdx, %r14
	addq %r11, %r13
	adcq %r12, %r14
	movq -72(%rsp), %rax
	movq -64(%rsp), %rdx
	shrdq $44, %rdx, %rax
	movq %rax, -56(%rsp)
	movq $0, -48(%rsp)
	addq -56(%rsp), %r13
	adcq -48(%rsp), %r14
	movabsq $17592186044415, %rsi
	andq %r13, %rsi
	leaq (%r8,%r8), %rdi
	movq %rdi, %rax
	mulq %r10
	movq %rax, %r11
	movq %rdx, %r12
	movq %r9, %rax
	mulq %r9
	addq %r11, %rax
	adcq %r12, %rdx
	shrdq $44, %r14, %r13
	movq %r13, -40(%rsp)
	movq $0, -32(%rsp)
	addq -40(%rsp), %rax
	adcq -32(%rsp), %rdx
	movabsq $4398046511103, %rdi
	andq %rax, %rdi
	shrdq $42, %rdx, %rax
	/* carry out of the top limb wraps times five */
	leaq (%rax,%rax,4), %r8
	addq %rcx, %r8
	movabsq $17592186044415, %r10
	andq %r8, %r10
	shrq $44, %r8
	addq %rsi, %r8
	movabsq $17592186044415, %r9
	andq %r8, %r9
	shrq $44, %r8
	addq %rdi, %r8
	/* store the result as five 26-bit limbs at r15 */
	movl %r10d, %eax
	andl $67108863, %eax
	movl %eax, (%r15)
	movl %r9d, %edx
	sall $18, %edx
	movq %r10, %rax
	shrq $26, %rax
	orl %edx, %eax
	andl $67108863, %eax
	movl %eax, 4(%r15)
	movq %r9, %rax
	shrq $8, %rax
	andl $67108863, %eax
	movl %eax, 8(%r15)
	movl %r8d, %edx
	sall $10, %edx
	movq %r9, %rax
	shrq $34, %rax
	orl %edx, %eax
	andl $67108863, %eax
	movl %eax, 12(%r15)
	movq %r8, %rax
	shrq $16, %rax
	movl %eax, 16(%r15)
	addq $1, %rbx
	cmpq $2, %rbx
	jne .L7
.L5:
	/* clear the flag word */
	movq $0, 120(%rbp)
	popq %rbx
	popq %rbp
	popq %r12
	popq %r13
	popq %r14
	popq %r15
	ret
FN_END poly1305_init_ext_avx



/*
 * blocks: rdi = state, rsi = input (a null pointer requests the final
 * reduction only), rdx = byte count.
 *
 * Flag word at state+120, as tested below:
 *   bit 0 (1)   h already holds data, clear means the first 32 input bytes
 *               are loaded directly into h
 *   bit 2 (4)   only the low lane gets the 2^128 bit
 *   bit 3 (8)   no lane gets the 2^128 bit
 *   bit 4 (16)  lane multipliers are taken from state+60 and state+40
 *   bit 5 (32)  lane multipliers are taken from state+40 and the constant 1
 *
 * Stack frame, 64-byte aligned base minus 200:
 *   152(rsp)  2^128 bit vector       184(rsp)  constant 5 in both lanes
 *   168(rsp)  multiplier limb 0      136, 104, 72, 40(rsp)  limbs 1..4
 *   120, 88, 56, 24(rsp)  limbs 1..4 times five
 *   8, -8, -24, -40, -56(rsp)  limbs 0..4 of the value at state+80
 *   -72, -88, -104, -120(rsp)  its limbs 1..4 times five
 */
GLOBAL_HIDDEN_FN poly1305_blocks_avx
poly1305_blocks_avx_local:
	pushq %rbp
	movq %rsp, %rbp
	pushq %rbx
	andq $-64, %rsp
	subq $200, %rsp
	/* xmm1 = 1 << 24, xmm0 = 26-bit mask, xmm2 = 5, each in both lanes */
	movl $(1 << 24), %eax
	movl $((1 << 26) - 1), %r8d
	movl $(5), %r9d
	vmovd %eax, %xmm1
	vmovd %r8d, %xmm0
	vmovd %r9d, %xmm2
	vpshufd $68, %xmm1, %xmm1
	vpshufd $68, %xmm0, %xmm0
	vpshufd $68, %xmm2, %xmm2
	vmovdqa %xmm1, 152(%rsp)
	vmovdqa %xmm2, 184(%rsp)
	movq 120(%rdi), %rax
	testb $4, %al
	je .L12
	/* keep the high bit in the low lane only */
	vpsrldq $8, %xmm1, %xmm1
	vmovdqa %xmm1, 152(%rsp)
.L12:
	testb $8, %al
	je .L13
	/* no high bit at all */
	vpxor %xmm1, %xmm1, %xmm1
	vmovdqa %xmm1, 152(%rsp)
.L13:
	testb $1, %al
	jne .L14
	/* first call: h = the first two 16-byte blocks, one per lane, as */
	/* limbs in xmm7, xmm12, xmm3, xmm6, xmm2 */
	vmovq (%rsi), %xmm1
	vpinsrq $1, 16(%rsi), %xmm1, %xmm1
	vmovq 8(%rsi), %xmm3
	vpinsrq $1, 24(%rsi), %xmm3, %xmm2
	vpand %xmm0, %xmm1, %xmm7
	vpsrlq $26, %xmm1, %xmm12
	vpand %xmm0, %xmm12, %xmm12
	vpsllq $12, %xmm2, %xmm3
	vpsrlq $52, %xmm1, %xmm1
	vpor %xmm3, %xmm1, %xmm6
	vpand %xmm0, %xmm6, %xmm3
	vpsrlq $26, %xmm6, %xmm6
	vpand %xmm0, %xmm6, %xmm6
	vpsrlq $40, %xmm2, %xmm2
	vpor 152(%rsp), %xmm2, %xmm2
	addq $32, %rsi
	subq $32, %rdx
	orq $1, %rax
	movq %rax, 120(%rdi)
	jmp .L15
.L14:
	/* later calls: reload the two-lane h limbs saved at state+0 */
	vmovdqu (%rdi), %xmm12
	vmovdqu 16(%rdi), %xmm6
	vmovdqu 32(%rdi), %xmm2
	vpshufd $80, %xmm12, %xmm7
	vpshufd $250, %xmm12, %xmm12
	vpshufd $80, %xmm6, %xmm3
	vpshufd $250, %xmm6, %xmm6
	vpshufd $80, %xmm2, %xmm2
.L15:
	/* build the per-lane multiplier limbs in xmm1, xmm11, xmm10, xmm5, xmm4 */
	movq 120(%rdi), %rax
	testb $48, %al
	je .L16
	testb $16, %al
	je .L17
	/* bit 4: low lane from state+60, high lane from state+40 */
	vmovdqu 40(%rdi), %xmm1
	vmovd 56(%rdi), %xmm4
	vmovdqu 60(%rdi), %xmm5
	vpunpckldq %xmm1, %xmm5, %xmm11
	vpunpckhdq %xmm1, %xmm5, %xmm5
	vmovd 76(%rdi), %xmm1
	vpunpcklqdq %xmm4, %xmm1, %xmm4
	jmp .L18
.L17:
	/* bit 5 only: low lane from state+40, high lane from the constant 1 */
	movl $(1), %r8d
	vmovdqu 40(%rdi), %xmm5
	vmovd 56(%rdi), %xmm4
	vmovd %r8d, %xmm1
	vpunpckldq %xmm1, %xmm5, %xmm11
	vpunpckhdq %xmm1, %xmm5, %xmm5
.L18:
	vpshufd $80, %xmm11, %xmm1
	vpshufd $250, %xmm11, %xmm11
	vpshufd $80, %xmm5, %xmm10
	vpshufd $250, %xmm5, %xmm5
	jmp .L19
.L16:
	/* neither bit: both lanes use the value at state+60 */
	vmovdqu 60(%rdi), %xmm5
	vpshufd $0, %xmm5, %xmm1
	vpshufd $85, %xmm5, %xmm11
	vpshufd $170, %xmm5, %xmm10
	vpshufd $255, %xmm5, %xmm5
	vmovd 76(%rdi), %xmm4
	vpshufd $0, %xmm4, %xmm4
.L19:
	/* spill multiplier limbs 1..4 and their multiples of five */
	vmovdqa %xmm11, 136(%rsp)
	vpmuludq 184(%rsp), %xmm11, %xmm13
	vmovdqa %xmm13, 120(%rsp)
	vmovdqa %xmm10, 104(%rsp)
	vpmuludq 184(%rsp), %xmm10, %xmm13
	vmovdqa %xmm13, 88(%rsp)
	vmovdqa %xmm5, 72(%rsp)
	vpmuludq 184(%rsp), %xmm5, %xmm5
	vmovdqa %xmm5, 56(%rsp)
	vmovdqa %xmm4, 40(%rsp)
	vpmuludq 184(%rsp), %xmm4, %xmm4
	vmovdqa %xmm4, 24(%rsp)
	cmpq $63, %rdx
	jbe .L20
	/* at least 64 bytes: broadcast and spill the limbs stored at state+80 */
	vmovdqu 80(%rdi), %xmm4
	vpshufd $0, %xmm4, %xmm5
	vmovdqa %xmm5, 8(%rsp)
	vpshufd $85, %xmm4, %xmm5
	vmovdqa %xmm5, -8(%rsp)
	vpshufd $170, %xmm4, %xmm13
	vmovdqa %xmm13, -24(%rsp)
	vpshufd $255, %xmm4, %xmm4
	vmovdqa %xmm4, %xmm10
	vmovdqa %xmm4, -40(%rsp)
	vmovd 96(%rdi), %xmm4
	vpshufd $0, %xmm4, %xmm4
	vmovdqa %xmm4, %xmm8
	vmovdqa %xmm4, -56(%rsp)
	vpmuludq 184(%rsp), %xmm5, %xmm4
	vmovdqa %xmm4, -72(%rsp)
	vpmuludq 184(%rsp), %xmm13, %xmm4
	vmovdqa %xmm4, -88(%rsp)
	vpmuludq 184(%rsp), %xmm10, %xmm4
	vmovdqa %xmm4, -104(%rsp)
	vpmuludq 184(%rsp), %xmm8, %xmm4
	vmovdqa %xmm4, -120(%rsp)
	leaq 32(%rsi), %rax
	movq %rdx, %rcx
	vmovdqa %xmm1, 168(%rsp)
	jmp .L22
	.p2align 6
	nop
	nop
	nop
	nop
	/* main loop, 64 input bytes per pass: h times the state+80 value, */
	/* plus the first 32 bytes times the lane multiplier, plus the next */
	/* 32 bytes, followed by a partial carry pass. rax = input + 32 */
.L22:
	vpmuludq -72(%rsp), %xmm2, %xmm13
	vmovdqa -88(%rsp), %xmm5
	vpmuludq %xmm5, %xmm6, %xmm4
	vpmuludq %xmm5, %xmm2, %xmm11
	vmovdqa -104(%rsp), %xmm9
	vpmuludq %xmm9, %xmm6, %xmm5
	vpmuludq %xmm9, %xmm2, %xmm10
	vpaddq %xmm4, %xmm13, %xmm13
	vpmuludq %xmm9, %xmm3, %xmm4
	vmovdqa -120(%rsp), %xmm8
	vpmuludq %xmm8, %xmm2, %xmm9
	vpaddq %xmm5, %xmm11, %xmm11
	vmovdqa %xmm8, %xmm5
	vpmuludq %xmm8, %xmm12, %xmm8
	vpmuludq %xmm5, %xmm3, %xmm14
	vpaddq %xmm4, %xmm13, %xmm13
	vpmuludq %xmm5, %xmm6, %xmm4
	vmovdqa 8(%rsp), %xmm15
	vpmuludq %xmm15, %xmm6, %xmm5
	vpaddq %xmm8, %xmm13, %xmm13
	vpmuludq %xmm15, %xmm2, %xmm8
	vpaddq %xmm14, %xmm11, %xmm11
	vpmuludq %xmm15, %xmm7, %xmm14
	vpaddq %xmm4, %xmm10, %xmm10
	vpmuludq %xmm15, %xmm12, %xmm4
	vpaddq %xmm5, %xmm9, %xmm9
	vpmuludq %xmm15, %xmm3, %xmm5
	vmovdqa -8(%rsp), %xmm15
	vpmuludq %xmm15, %xmm3, %xmm2
	vpaddq %xmm14, %xmm13, %xmm13
	vpmuludq %xmm15, %xmm6, %xmm6
	vpaddq %xmm4, %xmm11, %xmm11
	vpmuludq %xmm15, %xmm7, %xmm4
	vpaddq %xmm5, %xmm10, %xmm10
	/* low qwords of the first two blocks of this pass */
	vmovq -32(%rax), %xmm5
	vpinsrq $1, -16(%rax), %xmm5, %xmm5
	vpmuludq %xmm15, %xmm12, %xmm14
	vpaddq %xmm2, %xmm9, %xmm9
	vmovdqa -24(%rsp), %xmm2
	vpmuludq %xmm2, %xmm12, %xmm15
	vpaddq %xmm6, %xmm8, %xmm8
	vpmuludq %xmm2, %xmm3, %xmm3
	vpaddq %xmm4, %xmm11, %xmm11
	/* high qwords of the first two blocks of this pass */
	vmovq -24(%rax), %xmm4
	vpinsrq $1, -8(%rax), %xmm4, %xmm6
	vpmuludq %xmm2, %xmm7, %xmm4
	vpaddq %xmm14, %xmm10, %xmm10
	vmovdqa -40(%rsp), %xmm1
	vpmuludq %xmm1, %xmm7, %xmm14
	vpaddq %xmm15, %xmm9, %xmm9
	vpand %xmm5, %xmm0, %xmm2
	vpmuludq %xmm1, %xmm12, %xmm12
	vpaddq %xmm3, %xmm8, %xmm8
	vpsrlq $26, %xmm5, %xmm3
	vpand %xmm3, %xmm0, %xmm3
	vpmuludq -56(%rsp), %xmm7, %xmm7
	vpaddq %xmm4, %xmm10, %xmm10
	vpsllq $12, %xmm6, %xmm15
	vpsrlq $52, %xmm5, %xmm4
	vpor %xmm15, %xmm4, %xmm4
	vpaddq %xmm14, %xmm9, %xmm9
	vpsrlq $14, %xmm6, %xmm5
	vpand %xmm5, %xmm0, %xmm5
	vpaddq %xmm12, %xmm8, %xmm8
	vpand %xmm4, %xmm0, %xmm4
	vpaddq %xmm7, %xmm8, %xmm8
	vpsrlq $40, %xmm6, %xmm6
	vpor 152(%rsp), %xmm6, %xmm6
	/* the second two blocks of this pass are added in without a multiply */
	vmovdqu (%rax), %xmm12
	vmovdqu 16(%rax), %xmm7
	vpunpckldq %xmm7, %xmm12, %xmm15
	vpunpckhdq %xmm7, %xmm12, %xmm7
	vpxor %xmm14, %xmm14, %xmm14
	vpunpckldq %xmm14, %xmm15, %xmm12
	vpunpckhdq %xmm14, %xmm15, %xmm15
	vpunpckldq %xmm14, %xmm7, %xmm14
	vpxor %xmm1, %xmm1, %xmm1
	vpunpckhdq %xmm1, %xmm7, %xmm7
	vpsllq $6, %xmm15, %xmm15
	vpsllq $12, %xmm14, %xmm14
	vpsllq $18, %xmm7, %xmm7
	vpaddq %xmm12, %xmm13, %xmm12
	vpaddq %xmm15, %xmm11, %xmm15
	vpaddq %xmm14, %xmm10, %xmm14
	vpaddq %xmm7, %xmm9, %xmm7
	vpaddq 152(%rsp), %xmm8, %xmm8
	/* first two blocks times the lane multiplier */
	vpmuludq 120(%rsp), %xmm6, %xmm13
	vmovdqa 88(%rsp), %xmm10
	vpmuludq %xmm10, %xmm5, %xmm9
	vpmuludq %xmm10, %xmm6, %xmm11
	vmovdqa 56(%rsp), %xmm1
	vpmuludq %xmm1, %xmm5, %xmm10
	vpaddq %xmm13, %xmm12, %xmm12
	vpmuludq %xmm1, %xmm6, %xmm13
	vpaddq %xmm9, %xmm12, %xmm12
	vpmuludq %xmm1, %xmm4, %xmm9
	vpaddq %xmm11, %xmm15, %xmm15
	vmovdqa 24(%rsp), %xmm1
	vpmuludq %xmm1, %xmm6, %xmm11
	vpaddq %xmm10, %xmm15, %xmm10
	vpmuludq %xmm1, %xmm3, %xmm15
	vpaddq %xmm13, %xmm14, %xmm14
	vpmuludq %xmm1, %xmm4, %xmm13
	vpaddq %xmm9, %xmm12, %xmm9
	vpmuludq %xmm1, %xmm5, %xmm12
	vpaddq %xmm11, %xmm7, %xmm7
	vpmuludq 168(%rsp), %xmm5, %xmm11
	vpaddq %xmm15, %xmm9, %xmm9
	vpmuludq 168(%rsp), %xmm6, %xmm6
	vpaddq %xmm13, %xmm10, %xmm10
	vpmuludq 168(%rsp), %xmm2, %xmm15
	vpaddq %xmm12, %xmm14, %xmm14
	vpmuludq 168(%rsp), %xmm3, %xmm13
	vpaddq %xmm11, %xmm7, %xmm11
	vpmuludq 168(%rsp), %xmm4, %xmm12
	vpaddq %xmm6, %xmm8, %xmm6
	vmovdqa 136(%rsp), %xmm8
	vpmuludq %xmm8, %xmm4, %xmm7
	vpaddq %xmm15, %xmm9, %xmm9
	vpmuludq %xmm8, %xmm5, %xmm5
	vpaddq %xmm13, %xmm10, %xmm10
	vpmuludq %xmm8, %xmm2, %xmm15
	vpaddq %xmm12, %xmm14, %xmm14
	vpmuludq %xmm8, %xmm3, %xmm8
	vpaddq %xmm7, %xmm11, %xmm11
	vmovdqa 104(%rsp), %xmm7
	vpmuludq %xmm7, %xmm3, %xmm13
	vpaddq %xmm5, %xmm6, %xmm6
	vpmuludq %xmm7, %xmm4, %xmm4
	vpaddq %xmm15, %xmm10, %xmm10
	vpmuludq %xmm7, %xmm2, %xmm15
	vpaddq %xmm8, %xmm14, %xmm14
	vmovdqa 72(%rsp), %xmm5
	vpmuludq %xmm5, %xmm2, %xmm7
	vpaddq %xmm13, %xmm11, %xmm11
	vpmuludq %xmm5, %xmm3, %xmm3
	vpaddq %xmm4, %xmm6, %xmm6
	vpmuludq 40(%rsp), %xmm2, %xmm2
	vpaddq %xmm15, %xmm14, %xmm14
	vpaddq %xmm7, %xmm11, %xmm11
	vpaddq %xmm3, %xmm6, %xmm6
	vpaddq %xmm2, %xmm6, %xmm2
	/* carry pass: 26-bit steps, the top carry wraps times five. Leaves h */
	/* in xmm7, xmm12, xmm3, xmm6, xmm2 */
	vpsrlq $26, %xmm9, %xmm12
	vpsrlq $26, %xmm11, %xmm5
	vpand %xmm0, %xmm9, %xmm9
	vpand %xmm0, %xmm11, %xmm11
	vpaddq %xmm12, %xmm10, %xmm10
	vpaddq %xmm5, %xmm2, %xmm2
	vpsrlq $26, %xmm10, %xmm3
	vpsrlq $26, %xmm2, %xmm7
	vpand %xmm0, %xmm10, %xmm10
	vpand %xmm0, %xmm2, %xmm2
	vpaddq %xmm3, %xmm14, %xmm3
	vpmuludq 184(%rsp), %xmm7, %xmm7
	vpaddq %xmm7, %xmm9, %xmm9
	vpsrlq $26, %xmm3, %xmm6
	vpsrlq $26, %xmm9, %xmm12
	vpand %xmm0, %xmm3, %xmm3
	vpand %xmm0, %xmm9, %xmm7
	vpaddq %xmm6, %xmm11, %xmm6
	vpaddq %xmm12, %xmm10, %xmm12
	vpsrlq $26, %xmm6, %xmm8
	vpand %xmm0, %xmm6, %xmm6
	vpaddq %xmm8, %xmm2, %xmm2
	subq $64, %rcx
	addq $64, %rax
	cmpq $63, %rcx
	ja .L22
	/* restore multiplier limb 0 and advance rsi and rdx past the */
	/* bytes consumed by the loop */
	vmovdqa 168(%rsp), %xmm1
	leaq -64(%rdx), %rax
	andq $-64, %rax
	leaq 64(%rsi,%rax), %rsi
	andl $63, %edx
.L20:
	cmpq $31, %rdx
	jbe .L23
	/* one more step of h times the lane multiplier */
	vpmuludq 120(%rsp), %xmm2, %xmm11
	vmovdqa 88(%rsp), %xmm4
	vpmuludq %xmm4, %xmm6, %xmm0
	vpmuludq %xmm4, %xmm2, %xmm10
	vmovdqa 56(%rsp), %xmm4
	vpmuludq %xmm4, %xmm6, %xmm8
	vpmuludq %xmm4, %xmm2, %xmm5
	vpaddq %xmm0, %xmm11, %xmm11
	vpmuludq %xmm4, %xmm3, %xmm0
	vmovdqa 24(%rsp), %xmm13
	vpmuludq %xmm13, %xmm2, %xmm4
	vpaddq %xmm8, %xmm10, %xmm10
	vpmuludq %xmm13, %xmm12, %xmm8
	vpmuludq %xmm13, %xmm3, %xmm9
	vpaddq %xmm0, %xmm11, %xmm11
	vpmuludq %xmm13, %xmm6, %xmm13
	vpmuludq %xmm1, %xmm6, %xmm0
	vpaddq %xmm8, %xmm11, %xmm8
	vpmuludq %xmm1, %xmm2, %xmm2
	vpaddq %xmm9, %xmm10, %xmm9
	vpmuludq %xmm1, %xmm7, %xmm11
	vpaddq %xmm13, %xmm5, %xmm5
	vpmuludq %xmm1, %xmm12, %xmm10
	vpaddq %xmm0, %xmm4, %xmm0
	vpmuludq %xmm1, %xmm3, %xmm1
	vmovdqa 136(%rsp), %xmm4
	vpmuludq %xmm4, %xmm3, %xmm14
	vpaddq %xmm11, %xmm8, %xmm11
	vpmuludq %xmm4, %xmm6, %xmm6
	vpaddq %xmm10, %xmm9, %xmm9
	vpmuludq %xmm4, %xmm7, %xmm15
	vpaddq %xmm1, %xmm5, %xmm5
	vpmuludq %xmm4, %xmm12, %xmm1
	vpaddq %xmm14, %xmm0, %xmm0
	vmovdqa 104(%rsp), %xmm4
	vpmuludq %xmm4, %xmm12, %xmm8
	vpaddq %xmm6, %xmm2, %xmm2
	vpmuludq %xmm4, %xmm3, %xmm3
	vpaddq %xmm15, %xmm9, %xmm9
	vpmuludq %xmm4, %xmm7, %xmm10
	vpaddq %xmm1, %xmm5, %xmm1
	vmovdqa 72(%rsp), %xmm4
	vpmuludq %xmm4, %xmm7, %xmm15
	vpaddq %xmm8, %xmm0, %xmm0
	vpmuludq %xmm4, %xmm12, %xmm12
	vpaddq %xmm3, %xmm2, %xmm2
	vpmuludq 40(%rsp), %xmm7, %xmm7
	vpaddq %xmm10, %xmm1, %xmm1
	vpaddq %xmm15, %xmm0, %xmm0
	vpaddq %xmm12, %xmm2, %xmm2
	vpaddq %xmm7, %xmm2, %xmm2
	/* xmm0 was reused above, so rebuild the 26-bit mask in xmm15 */
	movl $((1 << 26) - 1), %r8d
	testq %rsi, %rsi
	vmovd %r8d, %xmm15
	je .L24
	/* a non-null input pointer: add the next 32 message bytes */
	vmovdqu (%rsi), %xmm4
	vmovdqu 16(%rsi), %xmm3
	vpunpckldq %xmm3, %xmm4, %xmm5
	vpunpckhdq %xmm3, %xmm4, %xmm3
	vpxor %xmm4, %xmm4, %xmm4
	vpunpckldq %xmm4, %xmm5, %xmm7
	vpunpckhdq %xmm4, %xmm5, %xmm5
	vpunpckldq %xmm4, %xmm3, %xmm6
	vpunpckhdq %xmm4, %xmm3, %xmm3
	vpsllq $6, %xmm5, %xmm5
	vpsllq $12, %xmm6, %xmm6
	vpsllq $18, %xmm3, %xmm3
	vpaddq %xmm7, %xmm11, %xmm11
	vpaddq %xmm5, %xmm9, %xmm9
	vpaddq %xmm6, %xmm1, %xmm1
	vpaddq %xmm3, %xmm0, %xmm0
	vpaddq 152(%rsp), %xmm2, %xmm2
.L24:
	/* carry pass, same shape as in the main loop */
	vpshufd $68, %xmm15, %xmm15
	vpsrlq $26, %xmm11, %xmm12
	vpsrlq $26, %xmm0, %xmm3
	vpand %xmm15, %xmm11, %xmm11
	vpand %xmm15, %xmm0, %xmm6
	vpaddq %xmm12, %xmm9, %xmm9
	vpaddq %xmm3, %xmm2, %xmm2
	vpsrlq $26, %xmm9, %xmm3
	vpsrlq $26, %xmm2, %xmm7
	vpand %xmm15, %xmm9, %xmm9
	vpand %xmm15, %xmm2, %xmm2
	vpaddq %xmm3, %xmm1, %xmm3
	vpmuludq 184(%rsp), %xmm7, %xmm7
	vpaddq %xmm7, %xmm11, %xmm7
	vpsrlq $26, %xmm3, %xmm4
	vpsrlq $26, %xmm7, %xmm1
	vpand %xmm15, %xmm3, %xmm3
	vpand %xmm15, %xmm7, %xmm7
	vpaddq %xmm4, %xmm6, %xmm6
	vpaddq %xmm1, %xmm9, %xmm12
	vpsrlq $26, %xmm6, %xmm0
	vpand %xmm15, %xmm6, %xmm6
	vpaddq %xmm0, %xmm2, %xmm2
.L23:
	testq %rsi, %rsi
	je .L25
	/* more input may follow: pack each limb to 32 bits per lane and save */
	/* the two-lane h at state+0..39 */
	vpshufd $8, %xmm7, %xmm7
	vpshufd $8, %xmm12, %xmm12
	vpshufd $8, %xmm3, %xmm3
	vpshufd $8, %xmm6, %xmm6
	vpshufd $8, %xmm2, %xmm2
	vpunpcklqdq %xmm12, %xmm7, %xmm7
	vpunpcklqdq %xmm6, %xmm3, %xmm3
	vmovdqu %xmm7, (%rdi)
	vmovdqu %xmm3, 16(%rdi)
	vmovq %xmm2, 32(%rdi)
	jmp .L11
.L25:
	/* null input pointer: add the two lanes together */
	vpsrldq $8, %xmm7, %xmm0
	vpaddq %xmm0, %xmm7, %xmm7
	vpsrldq $8, %xmm12, %xmm0
	vpaddq %xmm0, %xmm12, %xmm12
	vpsrldq $8, %xmm3, %xmm0
	vpaddq %xmm0, %xmm3, %xmm3
	vpsrldq $8, %xmm6, %xmm0
	vpaddq %xmm0, %xmm6, %xmm6
	vpsrldq $8, %xmm2, %xmm0
	vpaddq %xmm0, %xmm2, %xmm2
	/* carry through the 26-bit limbs and repack into 44, 44 and 42 bit */
	/* limbs in r11, r9 and rcx */
	vmovd %xmm7, %eax
	vmovd %xmm12, %edx
	movl %eax, %r9d
	shrl $26, %r9d
	addl %edx, %r9d
	movl %r9d, %r8d
	andl $67108863, %r8d
	vmovd %xmm3, %edx
	shrl $26, %r9d
	addl %edx, %r9d
	vmovd %xmm6, %edx
	movl %r9d, %ecx
	shrl $26, %ecx
	addl %edx, %ecx
	movl %ecx, %esi
	andl $67108863, %esi
	vmovd %xmm2, %r10d
	movl %r8d, %r11d
	salq $26, %r11
	andl $67108863, %eax
	orq %rax, %r11
	movabsq $17592186044415, %rax
	andq %rax, %r11
	andl $67108863, %r9d
	salq $8, %r9
	shrl $18, %r8d
	movl %r8d, %r8d
	orq %r8, %r9
	movq %rsi, %rdx
	salq $34, %rdx
	orq %rdx, %r9
	andq %rax, %r9
	shrl $26, %ecx
	addl %r10d, %ecx
	salq $16, %rcx
	shrl $10, %esi
	movl %esi, %esi
	orq %rsi, %rcx
	/* two rounds of carry propagation, the top carry wraps times five */
	movabsq $4398046511103, %r10
	movq %rcx, %r8
	andq %r10, %r8
	shrq $42, %rcx
	leaq (%rcx,%rcx,4), %rdx
	addq %r11, %rdx
	movq %rdx, %rsi
	andq %rax, %rsi
	shrq $44, %rdx
	addq %r9, %rdx
	movq %rdx, %rcx
	andq %rax, %rcx
	shrq $44, %rdx
	addq %r8, %rdx
	andq %rdx, %r10
	shrq $42, %rdx
	leaq (%rsi,%rdx,4), %rsi
	leaq (%rsi,%rdx), %r11
	movq %r11, %rbx
	andq %rax, %rbx
	shrq $44, %r11
	addq %rcx, %r11
	/* g = h + 5 - 2^130 in r9, r8, rsi */
	leaq 5(%rbx), %r9
	movq %r9, %r8
	shrq $44, %r8
	addq %r11, %r8
	movabsq $-4398046511104, %rsi
	addq %r10, %rsi
	movq %r8, %rdx
	shrq $44, %rdx
	addq %rdx, %rsi
	/* rdx = all ones when g did not go negative, else zero, then select */
	/* g or h without branching and store three limbs at state+0 */
	movq %rsi, %rdx
	shrq $63, %rdx
	subq $1, %rdx
	movq %rdx, %rcx
	notq %rcx
	andq %rcx, %rbx
	andq %rcx, %r11
	andq %r10, %rcx
	andq %rax, %r9
	andq %rdx, %r9
	orq %r9, %rbx
	movq %rbx, (%rdi)
	andq %r8, %rax
	andq %rdx, %rax
	orq %rax, %r11
	movq %r11, 8(%rdi)
	andq %rsi, %rdx
	orq %rcx, %rdx
	movq %rdx, 16(%rdi)
.L11:
	/* restore rbx from the frame and return */
	movq -8(%rbp), %rbx
	leave
	ret
FN_END poly1305_blocks_avx

GLOBAL_HIDDEN_FN poly1305_finish_ext_avx 718 | poly1305_finish_ext_avx_local: 719 | pushq %r12 720 | pushq %rbp 721 | pushq %rbx 722 | subq $32, %rsp 723 | movq %rdi, %rbx 724 | movq %rdx, %rbp 725 | movq %rcx, %r12 726 | testq %rdx, %rdx 727 | je .L30 728 | movq $0, (%rsp) 729 | movq $0, 8(%rsp) 730 | movq $0, 16(%rsp) 731 | movq $0, 24(%rsp) 732 | movq %rsp, %rax 733 | subq %rsp, %rsi 734 | testb $16, %dl 735 | je .L31 736 | vmovdqu (%rsp,%rsi), %xmm0 737 | vmovdqa %xmm0, (%rsp) 738 | addq $16, %rax 739 | .L31: 740 | testb $8, %bpl 741 | je .L32 742 | movq (%rax,%rsi), %rdx 743 | movq %rdx, (%rax) 744 | addq $8, %rax 745 | .L32: 746 | testb $4, %bpl 747 | je .L33 748 | movl (%rax,%rsi), %edx 749 | movl %edx, (%rax) 750 | addq $4, %rax 751 | .L33: 752 | testb $2, %bpl 753 | je .L34 754 | movzwl (%rax,%rsi), %edx 755 | movw %dx, (%rax) 756 | addq $2, %rax 757 | .L34: 758 | testb $1, %bpl 759 | je .L35 760 | movzbl (%rax,%rsi), %edx 761 | movb %dl, (%rax) 762 | .L35: 763 | cmpq $16, %rbp 764 | je .L36 765 | movb $1, (%rsp,%rbp) 766 | movq 120(%rbx), %rdx 767 | cmpq $16, %rbp 768 | sbbq %rax, %rax 769 | andl $4, %eax 770 | addq $4, %rax 771 | .L37: 772 | orq %rdx, %rax 773 | movq %rax, 120(%rbx) 774 | movq %rsp, %rsi 775 | movl $32, %edx 776 | movq %rbx, %rdi 777 | call poly1305_blocks_avx_local 778 | .L30: 779 | movq 120(%rbx), %rax 780 | testb $1, %al 781 | je .L38 782 | subq $1, %rbp 783 | cmpq $15, %rbp 784 | jbe .L39 785 | orq $16, %rax 786 | movq %rax, 120(%rbx) 787 | jmp .L40 788 | .L39: 789 | orq $32, %rax 790 | movq %rax, 120(%rbx) 791 | .L40: 792 | movl $32, %edx 793 | movl $0, %esi 794 | movq %rbx, %rdi 795 | call poly1305_blocks_avx_local 796 | .L38: 797 | movq 8(%rbx), %rax 798 | movq %rax, %rdx 799 | salq $44, %rdx 800 | orq (%rbx), %rdx 801 | shrq $20, %rax 802 | movq 16(%rbx), %rcx 803 | salq $24, %rcx 804 | orq %rcx, %rax 805 | movq 104(%rbx), %rcx 806 | movq 112(%rbx), %rsi 807 | addq %rcx, %rdx 808 | adcq %rsi, %rax 809 | vpxor %xmm0, %xmm0, %xmm0 
810 | vmovdqu %xmm0, (%rbx) 811 | vmovdqu %xmm0, 16(%rbx) 812 | vmovdqu %xmm0, 32(%rbx) 813 | vmovdqu %xmm0, 48(%rbx) 814 | vmovdqu %xmm0, 64(%rbx) 815 | vmovdqu %xmm0, 80(%rbx) 816 | vmovdqu %xmm0, 96(%rbx) 817 | vmovdqu %xmm0, 112(%rbx) 818 | movq %rdx, (%r12) 819 | movq %rax, 8(%r12) 820 | jmp .L43 821 | .L36: 822 | movq 120(%rbx), %rdx 823 | movl $4, %eax 824 | jmp .L37 825 | .L43: 826 | addq $32, %rsp 827 | popq %rbx 828 | popq %rbp 829 | popq %r12 830 | ret 831 | FN_END poly1305_finish_ext_avx 832 | 833 | GLOBAL_HIDDEN_FN poly1305_auth_avx 834 | cmp $128, %rdx 835 | jb poly1305_auth_x86_local 836 | pushq %rbp 837 | movq %rsp, %rbp 838 | pushq %r14 839 | pushq %r13 840 | pushq %r12 841 | pushq %rbx 842 | andq $-64, %rsp 843 | addq $-128, %rsp 844 | movq %rdi, %r14 845 | movq %rsi, %r12 846 | movq %rdx, %rbx 847 | movq %rsp, %rdi 848 | movq %rcx, %rsi 849 | call poly1305_init_ext_avx_local 850 | movq %rbx, %r13 851 | andq $-32, %r13 852 | je .L46 853 | movq %rsp, %rdi 854 | movq %r13, %rdx 855 | movq %r12, %rsi 856 | call poly1305_blocks_avx_local 857 | addq %r13, %r12 858 | subq %r13, %rbx 859 | .L46: 860 | movq %rsp, %rdi 861 | movq %r14, %rcx 862 | movq %rbx, %rdx 863 | movq %r12, %rsi 864 | call poly1305_finish_ext_avx_local 865 | leaq -32(%rbp), %rsp 866 | popq %rbx 867 | popq %r12 868 | popq %r13 869 | popq %r14 870 | popq %rbp 871 | ret 872 | FN_END poly1305_auth_avx 873 | -------------------------------------------------------------------------------- /app/extensions/poly1305/poly1305_constants_x86.inc: -------------------------------------------------------------------------------- 1 | SECTION_RODATA 2 | 3 | .p2align 4 4 | poly1305_constants_x86: 5 | /* 0 */ poly1305_x86_scale: .long 0x0,0x37f40000 6 | /* 8 */ poly1305_x86_two32: .long 0x0,0x41f00000 7 | /* 16 */ poly1305_x86_two64: .long 0x0,0x43f00000 8 | /* 24 */ poly1305_x86_two96: .long 0x0,0x45f00000 9 | /* 32 */ poly1305_x86_alpha32: .long 0x0,0x45e80000 10 | /* 40 */ poly1305_x86_alpha64: .long 
0x0,0x47e80000 11 | /* 48 */ poly1305_x86_alpha96: .long 0x0,0x49e80000 12 | /* 56 */ poly1305_x86_alpha130: .long 0x0,0x4c080000 13 | /* 64 */ poly1305_x86_doffset0: .long 0x0,0x43300000 14 | /* 72 */ poly1305_x86_doffset1: .long 0x0,0x45300000 15 | /* 80 */ poly1305_x86_doffset2: .long 0x0,0x47300000 16 | /* 88 */ poly1305_x86_doffset3: .long 0x0,0x49300000 17 | /* 96 */ poly1305_x86_doffset3minustwo128: .long 0x0,0x492ffffe 18 | /* 104 */ poly1305_x86_hoffset0: .long 0xfffffffb,0x43300001 19 | /* 112 */ poly1305_x86_hoffset1: .long 0xfffffffe,0x45300001 20 | /* 120 */ poly1305_x86_hoffset2: .long 0xfffffffe,0x47300001 21 | /* 124 */ poly1305_x86_hoffset3: .long 0xfffffffe,0x49300003 22 | -------------------------------------------------------------------------------- /app/extensions/poly1305/poly1305_neon-32.inc: -------------------------------------------------------------------------------- 1 | SECTION_TEXT 2 | 3 | .arm 4 | .fpu neon 5 | 6 | GLOBAL_HIDDEN_FN poly1305_block_size_neon 7 | mov r0, #32 8 | bx lr 9 | FN_END poly1305_block_size_neon 10 | 11 | .p2align 2 12 | poly1305_init_constants_neon: 13 | .long 0x3ffff03 14 | .long 0x3ffc0ff 15 | .long 0x3f03fff 16 | .long 0x00fffff 17 | 18 | GLOBAL_HIDDEN_FN poly1305_init_ext_neon 19 | poly1305_init_ext_neon_local: 20 | stmfd sp!, {r4-r11, lr} 21 | sub sp, sp, #32 22 | mov r14, r2 23 | and r2, r2, r2 24 | moveq r14, #-1 25 | ldmia r1!, {r2-r5} 26 | ldr r7, =poly1305_init_constants_neon 27 | mov r6, r2 28 | mov r8, r2, lsr #26 29 | mov r9, r3, lsr #20 30 | mov r10, r4, lsr #14 31 | mov r11, r5, lsr #8 32 | orr r8, r8, r3, lsl #6 33 | orr r9, r9, r4, lsl #12 34 | orr r10, r10, r5, lsl #18 35 | ldmia r7, {r2-r5} 36 | and r2, r2, r8 37 | and r3, r3, r9 38 | and r4, r4, r10 39 | and r5, r5, r11 40 | and r6, r6, 0x3ffffff 41 | stmia r0!, {r2-r6} 42 | eor r8, r8, r8 43 | str r8, [sp, #24] 44 | poly1305_init_ext_neon_squareloop: 45 | ldr r8, [sp, #24] 46 | mov r12, #16 47 | cmp r8, #2 48 | beq 
poly1305_init_ext_neon_donesquaring 49 | cmp r8, #1 50 | moveq r12, #64 51 | cmp r14, r12 52 | bls poly1305_init_ext_neon_donesquaring 53 | add r8, #1 54 | str r8, [sp, #24] 55 | mov r6, r6, lsl #1 56 | mov r2, r2, lsl #1 57 | umull r7, r8, r3, r3 58 | umull r9, r10, r6, r4 59 | umlal r7, r8, r6, r5 60 | umlal r9, r10, r2, r3 61 | add r11, r5, r5, lsl #2 62 | umlal r7, r8, r2, r4 63 | umlal r9, r10, r5, r11 64 | str r7, [sp, #16] 65 | str r8, [sp, #20] 66 | mov r2, r2, lsr #1 67 | mov r5, r5, lsl #1 68 | str r9, [sp, #8] 69 | str r10, [sp, #12] 70 | umull r7, r8, r2, r2 71 | umull r9, r10, r6, r2 72 | add r11, r3, r3, lsl #2 73 | add r12, r4, r4, lsl #2 74 | umlal r7, r8, r6, r3 75 | umlal r9, r10, r5, r11 76 | umlal r7, r8, r5, r12 77 | umlal r9, r10, r4, r12 78 | mov r6, r6, lsr #1 79 | mov r3, r3, lsl #1 80 | add r11, r2, r2, lsl #2 81 | str r7, [sp, #0] 82 | str r8, [sp, #4] 83 | umull r7, r8, r6, r6 84 | umlal r7, r8, r3, r12 85 | umlal r7, r8, r5, r11 86 | and r6, r7, 0x3ffffff 87 | mov r11, r7, lsr #26 88 | orr r11, r11, r8, lsl #6 89 | ldr r7, [sp, #0] 90 | ldr r8, [sp, #4] 91 | adds r9, r9, r11 92 | adc r10, r10, #0 93 | and r2, r9, 0x3ffffff 94 | mov r11, r9, lsr #26 95 | orr r11, r11, r10, lsl #6 96 | ldr r9, [sp, #8] 97 | ldr r10, [sp, #12] 98 | adds r7, r7, r11 99 | adc r8, r8, #0 100 | and r3, r7, 0x3ffffff 101 | mov r11, r7, lsr #26 102 | orr r11, r11, r8, lsl #6 103 | ldr r7, [sp, #16] 104 | ldr r8, [sp, #20] 105 | adds r9, r9, r11 106 | adc r10, r10, #0 107 | and r4, r9, 0x3ffffff 108 | mov r11, r9, lsr #26 109 | orr r11, r11, r10, lsl #6 110 | adds r7, r7, r11 111 | adc r8, r8, #0 112 | and r5, r7, 0x3ffffff 113 | mov r11, r7, lsr #26 114 | orr r11, r11, r8, lsl #6 115 | add r11, r11, r11, lsl #2 116 | add r6, r6, r11 117 | mov r11, r6, lsr #26 118 | and r6, r6, 0x3ffffff 119 | add r2, r2, r11 120 | stmia r0!, {r2-r6} 121 | b poly1305_init_ext_neon_squareloop 122 | poly1305_init_ext_neon_donesquaring: 123 | mov r2, #2 124 | ldr r14, [sp, #24] 125 
| sub r14, r2, r14 126 | mov r3, r14, lsl #4 127 | add r3, r3, r14, lsl #2 128 | add r0, r0, r3 129 | eor r2, r2, r2 130 | eor r3, r3, r3 131 | eor r4, r4, r4 132 | eor r5, r5, r5 133 | eor r6, r6, r6 134 | stmia r0!, {r2-r6} 135 | stmia r0!, {r2-r6} 136 | ldmia r1!, {r2-r5} 137 | stmia r0, {r2-r6} 138 | add sp, sp, #32 139 | ldmfd sp!, {r4-r11, lr} 140 | bx lr 141 | FN_END poly1305_init_ext_neon 142 | 143 | .ltorg 144 | 145 | GLOBAL_HIDDEN_FN poly1305_blocks_neon 146 | poly1305_blocks_neon_local: 147 | vmov.i32 q0, #0xffffffff 148 | vmov.i32 d4, #1 149 | vsubw.u32 q0, q0, d4 150 | vstmdb sp!, {q4,q5,q6,q7} 151 | stmfd sp!, {r4-r11, lr} 152 | mov r8, sp 153 | and sp, sp, #~63 154 | sub sp, sp, #192 155 | str r0, [sp, #108] 156 | str r1, [sp, #112] 157 | str r2, [sp, #116] 158 | str r8, [sp, #120] 159 | mov r3, r0 160 | mov r0, r1 161 | mov r1, r2 162 | mov r2, r3 163 | ldr r8, [r2, #116] 164 | veor d15, d15, d15 165 | vorr.i32 d15, #(1 << 24) 166 | tst r8, #2 167 | beq poly1305_blocks_neon_skip_shift8 168 | vshr.u64 d15, #32 169 | poly1305_blocks_neon_skip_shift8: 170 | tst r8, #4 171 | beq poly1305_blocks_neon_skip_shift16 172 | veor d15, d15, d15 173 | poly1305_blocks_neon_skip_shift16: 174 | vst1.64 d15, [sp, :64] 175 | tst r8, #1 176 | bne poly1305_blocks_neon_started 177 | vld1.64 {q0-q1}, [r0]! 
178 | vswp d1, d2 179 | vmovn.i64 d21, q0 180 | vshrn.i64 d22, q0, #26 181 | vshrn.u64 d24, q1, #14 182 | vext.8 d0, d0, d2, #4 183 | vext.8 d1, d1, d3, #4 184 | vshr.u64 q1, q1, #32 185 | vshrn.i64 d23, q0, #20 186 | vshrn.u64 d25, q1, #8 187 | vand.i32 d21, #0x03ffffff 188 | vand.i32 q11, #0x03ffffff 189 | vand.i32 q12, #0x03ffffff 190 | orr r8, r8, #1 191 | sub r1, r1, #32 192 | str r8, [r2, #116] 193 | vorr d25, d25, d15 194 | b poly1305_blocks_neon_setupr20 195 | poly1305_blocks_neon_started: 196 | add r9, r2, #60 197 | vldm r9, {d21-d25} 198 | poly1305_blocks_neon_setupr20: 199 | vmov.i32 d0, #5 200 | tst r8, #(8|16) 201 | beq poly1305_blocks_neon_setupr20_simple 202 | tst r8, #(8) 203 | beq poly1305_blocks_neon_setupr20_r_1 204 | mov r9, r2 205 | add r10, r2, #20 206 | vld1.64 {q9}, [r9]! 207 | vld1.64 {q8}, [r10]! 208 | vld1.64 {d2}, [r9] 209 | vld1.64 {d20}, [r10] 210 | b poly1305_blocks_neon_setupr20_hard 211 | poly1305_blocks_neon_setupr20_r_1: 212 | mov r9, r2 213 | vmov.i32 d2, #1 214 | vld1.64 {q8}, [r9]! 
215 | veor q9, q9, q9 216 | vshr.u64 d2, d2, #32 217 | vld1.64 {d20}, [r9] 218 | poly1305_blocks_neon_setupr20_hard: 219 | vzip.i32 q8, q9 220 | vzip.i32 d20, d2 221 | b poly1305_blocks_neon_setups20 222 | poly1305_blocks_neon_setupr20_simple: 223 | add r9, r2, #20 224 | vld1.64 {d2-d4}, [r9] 225 | vdup.32 d16, d2[0] 226 | vdup.32 d17, d2[1] 227 | vdup.32 d18, d3[0] 228 | vdup.32 d19, d3[1] 229 | vdup.32 d20, d4[0] 230 | poly1305_blocks_neon_setups20: 231 | vmul.i32 q13, q8, d0[0] 232 | vmov.i64 q15, 0x00000000ffffffff 233 | vmul.i32 q14, q9, d0[0] 234 | vshr.u64 q15, q15, #6 235 | cmp r1, #64 236 | blo poly1305_blocks_neon_try32 237 | add r9, sp, #16 238 | add r10, r2, #40 239 | add r11, sp, #64 240 | str r1, [sp, #116] 241 | vld1.64 {d10-d12}, [r10] 242 | vmov d14, d12 243 | vmul.i32 q6, q5, d0[0] 244 | poly1305_blocks_neon_mainloop: 245 | ldmia r0!, {r2-r5} 246 | vmull.u32 q0, d25, d12[0] 247 | mov r7, r2, lsr #26 248 | vmlal.u32 q0, d24, d12[1] 249 | mov r8, r3, lsr #20 250 | ldr r6, [sp, #0] 251 | vmlal.u32 q0, d23, d13[0] 252 | mov r9, r4, lsr #14 253 | vmlal.u32 q0, d22, d13[1] 254 | orr r6, r6, r5, lsr #8 255 | vmlal.u32 q0, d21, d14[0] 256 | orr r3, r7, r3, lsl #6 257 | vmull.u32 q1, d25, d12[1] 258 | orr r4, r8, r4, lsl #12 259 | orr r5, r9, r5, lsl #18 260 | vmlal.u32 q1, d24, d13[0] 261 | ldmia r0!, {r7-r10} 262 | vmlal.u32 q1, d23, d13[1] 263 | mov r1, r7, lsr #26 264 | vmlal.u32 q1, d22, d14[0] 265 | ldr r11, [sp, #4] 266 | mov r12, r8, lsr #20 267 | vmlal.u32 q1, d21, d10[0] 268 | mov r14, r9, lsr #14 269 | vmull.u32 q2, d25, d13[0] 270 | orr r11, r11, r10, lsr #8 271 | orr r8, r1, r8, lsl #6 272 | vmlal.u32 q2, d24, d13[1] 273 | orr r9, r12, r9, lsl #12 274 | vmlal.u32 q2, d23, d14[0] 275 | orr r10, r14, r10, lsl #18 276 | vmlal.u32 q2, d22, d10[0] 277 | mov r12, r3 278 | and r2, r2, #0x3ffffff 279 | vmlal.u32 q2, d21, d10[1] 280 | mov r14, r5 281 | vmull.u32 q3, d25, d13[1] 282 | and r3, r7, #0x3ffffff 283 | vmlal.u32 q3, d24, d14[0] 284 | and r5, 
r8, #0x3ffffff 285 | vmlal.u32 q3, d23, d10[0] 286 | and r7, r9, #0x3ffffff 287 | vmlal.u32 q3, d22, d10[1] 288 | and r8, r14, #0x3ffffff 289 | vmlal.u32 q3, d21, d11[0] 290 | and r9, r10, #0x3ffffff 291 | add r14, sp, #128 292 | vmull.u32 q4, d25, d14[0] 293 | mov r10, r6 294 | vmlal.u32 q4, d24, d10[0] 295 | and r6, r4, #0x3ffffff 296 | vmlal.u32 q4, d23, d10[1] 297 | and r4, r12, #0x3ffffff 298 | vmlal.u32 q4, d22, d11[0] 299 | stm r14, {r2-r11} 300 | vmlal.u32 q4, d21, d11[1] 301 | vld1.64 {d21-d24}, [r14, :256]! 302 | vld1.64 {d25}, [r14, :64] 303 | ldmia r0!, {r2-r5} 304 | vmlal.u32 q0, d25, d26 305 | mov r7, r2, lsr #26 306 | vmlal.u32 q0, d24, d27 307 | ldr r6, [sp, #0] 308 | mov r8, r3, lsr #20 309 | vmlal.u32 q0, d23, d28 310 | mov r9, r4, lsr #14 311 | vmlal.u32 q0, d22, d29 312 | orr r6, r6, r5, lsr #8 313 | vmlal.u32 q0, d21, d20 314 | orr r3, r7, r3, lsl #6 315 | vmlal.u32 q1, d25, d27 316 | orr r4, r8, r4, lsl #12 317 | orr r5, r9, r5, lsl #18 318 | vmlal.u32 q1, d24, d28 319 | ldmia r0!, {r7-r10} 320 | vmlal.u32 q1, d23, d29 321 | mov r1, r7, lsr #26 322 | vmlal.u32 q1, d22, d20 323 | ldr r11, [sp, #4] 324 | mov r12, r8, lsr #20 325 | vmlal.u32 q1, d21, d16 326 | mov r14, r9, lsr #14 327 | vmlal.u32 q2, d25, d28 328 | orr r11, r11, r10, lsr #8 329 | orr r8, r1, r8, lsl #6 330 | orr r9, r12, r9, lsl #12 331 | vmlal.u32 q2, d24, d29 332 | orr r10, r14, r10, lsl #18 333 | and r2, r2, #0x3ffffff 334 | mov r12, r3 335 | vmlal.u32 q2, d23, d20 336 | mov r14, r5 337 | vmlal.u32 q2, d22, d16 338 | and r3, r7, #0x3ffffff 339 | vmlal.u32 q2, d21, d17 340 | and r5, r8, #0x3ffffff 341 | vmlal.u32 q3, d25, d29 342 | and r7, r9, #0x3ffffff 343 | vmlal.u32 q3, d24, d20 344 | and r8, r14, #0x3ffffff 345 | vmlal.u32 q3, d23, d16 346 | and r9, r10, #0x3ffffff 347 | vmlal.u32 q3, d22, d17 348 | add r14, sp, #128 349 | vmlal.u32 q3, d21, d18 350 | mov r10, r6 351 | vmlal.u32 q4, d25, d20 352 | vmlal.u32 q4, d24, d16 353 | and r6, r4, #0x3ffffff 354 | vmlal.u32 q4, d23, 
d17 355 | and r4, r12, #0x3ffffff 356 | vmlal.u32 q4, d22, d18 357 | stm r14, {r2-r11} 358 | vmlal.u32 q4, d21, d19 359 | vld1.64 {d21-d24}, [r14, :256]! 360 | vld1.64 {d25}, [r14, :64] 361 | vaddw.u32 q0, q0, d21 362 | vaddw.u32 q1, q1, d22 363 | vaddw.u32 q2, q2, d23 364 | vaddw.u32 q3, q3, d24 365 | vaddw.u32 q4, q4, d25 366 | vshr.u64 q11, q0, #26 367 | vand q0, q0, q15 368 | vadd.i64 q1, q1, q11 369 | vshr.u64 q12, q3, #26 370 | vand q3, q3, q15 371 | vadd.i64 q4, q4, q12 372 | vshr.u64 q11, q1, #26 373 | vand q1, q1, q15 374 | vadd.i64 q2, q2, q11 375 | vshr.u64 q12, q4, #26 376 | vand q4, q4, q15 377 | vadd.i64 q0, q0, q12 378 | vshl.i64 q12, q12, #2 379 | ldr r1, [sp, #116] 380 | vadd.i64 q0, q0, q12 381 | vshr.u64 q11, q2, #26 382 | vand q2, q2, q15 383 | vadd.i64 q3, q3, q11 384 | sub r1, #64 385 | vshr.u64 q12, q0, #26 386 | vand q0, q0, q15 387 | vadd.i64 q1, q1, q12 388 | cmp r1, #64 389 | vshr.u64 q11, q3, #26 390 | vand q3, q3, q15 391 | vadd.i64 q4, q4, q11 392 | vmovn.i64 d21, q0 393 | str r1, [sp, #116] 394 | vmovn.i64 d22, q1 395 | vmovn.i64 d23, q2 396 | vmovn.i64 d24, q3 397 | vmovn.i64 d25, q4 398 | bhs poly1305_blocks_neon_mainloop 399 | poly1305_blocks_neon_try32: 400 | cmp r1, #32 401 | blo poly1305_blocks_neon_done 402 | tst r0, r0 403 | bne poly1305_blocks_loadm32 404 | veor q0, q0, q0 405 | veor q1, q1, q1 406 | veor q2, q2, q2 407 | veor q3, q3, q3 408 | veor q4, q4, q4 409 | b poly1305_blocks_continue32 410 | poly1305_blocks_loadm32: 411 | vld1.64 {q0-q1}, [r0]! 
412 | veor q4, q4, q4 413 | vswp d1, d2 414 | veor q3, q3, q3 415 | vtrn.32 q0, q4 416 | vtrn.32 q1, q3 417 | vshl.i64 q2, q1, #12 418 | vshl.i64 q3, q3, #18 419 | vshl.i64 q1, q4, #6 420 | vmovl.u32 q4, d15 421 | poly1305_blocks_continue32: 422 | vmlal.u32 q0, d25, d26 423 | vmlal.u32 q0, d24, d27 424 | vmlal.u32 q0, d23, d28 425 | vmlal.u32 q0, d22, d29 426 | vmlal.u32 q0, d21, d20 427 | vmlal.u32 q1, d25, d27 428 | vmlal.u32 q1, d24, d28 429 | vmlal.u32 q1, d23, d29 430 | vmlal.u32 q1, d22, d20 431 | vmlal.u32 q1, d21, d16 432 | vmlal.u32 q2, d25, d28 433 | vmlal.u32 q2, d24, d29 434 | vmlal.u32 q2, d23, d20 435 | vmlal.u32 q2, d22, d16 436 | vmlal.u32 q2, d21, d17 437 | vmlal.u32 q3, d25, d29 438 | vmlal.u32 q3, d24, d20 439 | vmlal.u32 q3, d23, d16 440 | vmlal.u32 q3, d22, d17 441 | vmlal.u32 q3, d21, d18 442 | vmlal.u32 q4, d25, d20 443 | vmlal.u32 q4, d24, d16 444 | vmlal.u32 q4, d23, d17 445 | vmlal.u32 q4, d22, d18 446 | vmlal.u32 q4, d21, d19 447 | vshr.u64 q11, q0, #26 448 | vand q0, q0, q15 449 | vadd.i64 q1, q1, q11 450 | vshr.u64 q12, q3, #26 451 | vand q3, q3, q15 452 | vadd.i64 q4, q4, q12 453 | vshr.u64 q11, q1, #26 454 | vand q1, q1, q15 455 | vadd.i64 q2, q2, q11 456 | vshr.u64 q12, q4, #26 457 | vand q4, q4, q15 458 | vadd.i64 q0, q0, q12 459 | vshl.i64 q12, q12, #2 460 | vadd.i64 q0, q0, q12 461 | vshr.u64 q11, q2, #26 462 | vand q2, q2, q15 463 | vadd.i64 q3, q3, q11 464 | vshr.u64 q12, q0, #26 465 | vand q0, q0, q15 466 | vadd.i64 q1, q1, q12 467 | vshr.u64 q11, q3, #26 468 | vand q3, q3, q15 469 | vadd.i64 q4, q4, q11 470 | vmovn.i64 d21, q0 471 | vmovn.i64 d22, q1 472 | vmovn.i64 d23, q2 473 | vmovn.i64 d24, q3 474 | vmovn.i64 d25, q4 475 | poly1305_blocks_neon_done: 476 | tst r0, r0 477 | beq poly1305_blocks_neon_final 478 | ldr r2, [sp, #108] 479 | add r2, r2, #60 480 | vst1.64 {d21}, [r2]! 
481 | vst1.64 {d22-d25}, [r2] 482 | b poly1305_blocks_neon_leave 483 | poly1305_blocks_neon_final: 484 | vadd.u32 d10, d0, d1 485 | vadd.u32 d13, d2, d3 486 | vadd.u32 d11, d4, d5 487 | ldr r5, [sp, #108] 488 | vadd.u32 d14, d6, d7 489 | vadd.u32 d12, d8, d9 490 | vtrn.32 d10, d13 491 | vtrn.32 d11, d14 492 | vst1.64 {d10-d12}, [sp] 493 | ldm sp, {r0-r4} 494 | mov r12, r0, lsr #26 495 | and r0, r0, #0x3ffffff 496 | add r1, r1, r12 497 | mov r12, r1, lsr #26 498 | and r1, r1, #0x3ffffff 499 | add r2, r2, r12 500 | mov r12, r2, lsr #26 501 | and r2, r2, #0x3ffffff 502 | add r3, r3, r12 503 | mov r12, r3, lsr #26 504 | and r3, r3, #0x3ffffff 505 | add r4, r4, r12 506 | mov r12, r4, lsr #26 507 | and r4, r4, #0x3ffffff 508 | add r12, r12, r12, lsl #2 509 | add r0, r0, r12 510 | mov r12, r0, lsr #26 511 | and r0, r0, #0x3ffffff 512 | add r1, r1, r12 513 | mov r12, r1, lsr #26 514 | and r1, r1, #0x3ffffff 515 | add r2, r2, r12 516 | mov r12, r2, lsr #26 517 | and r2, r2, #0x3ffffff 518 | add r3, r3, r12 519 | mov r12, r3, lsr #26 520 | and r3, r3, #0x3ffffff 521 | add r4, r4, r12 522 | mov r12, r4, lsr #26 523 | and r4, r4, #0x3ffffff 524 | add r12, r12, r12, lsl #2 525 | add r0, r0, r12 526 | mov r12, r0, lsr #26 527 | and r0, r0, #0x3ffffff 528 | add r1, r1, r12 529 | add r6, r0, #5 530 | mov r12, r6, lsr #26 531 | and r6, r6, #0x3ffffff 532 | add r7, r1, r12 533 | mov r12, r7, lsr #26 534 | and r7, r7, #0x3ffffff 535 | add r10, r2, r12 536 | mov r12, r10, lsr #26 537 | and r10, r10, #0x3ffffff 538 | add r11, r3, r12 539 | mov r12, #-(1 << 26) 540 | add r12, r12, r11, lsr #26 541 | and r11, r11, #0x3ffffff 542 | add r14, r4, r12 543 | mov r12, r14, lsr #31 544 | sub r12, #1 545 | and r6, r6, r12 546 | and r7, r7, r12 547 | and r10, r10, r12 548 | and r11, r11, r12 549 | and r14, r14, r12 550 | mvn r12, r12 551 | and r0, r0, r12 552 | and r1, r1, r12 553 | and r2, r2, r12 554 | and r3, r3, r12 555 | and r4, r4, r12 556 | orr r0, r0, r6 557 | orr r1, r1, r7 558 | orr r2, 
r2, r10 559 | orr r3, r3, r11 560 | orr r4, r4, r14 561 | orr r0, r0, r1, lsl #26 562 | lsr r1, r1, #6 563 | orr r1, r1, r2, lsl #20 564 | lsr r2, r2, #12 565 | orr r2, r2, r3, lsl #14 566 | lsr r3, r3, #18 567 | orr r3, r3, r4, lsl #8 568 | add r5, r5, #60 569 | stm r5, {r0-r3} 570 | poly1305_blocks_neon_leave: 571 | ldr sp, [sp, #120] 572 | ldmfd sp!, {r4-r11, lr} 573 | vldm sp!, {q4-q7} 574 | bx lr 575 | FN_END poly1305_init_ext_neon 576 | 577 | GLOBAL_HIDDEN_FN poly1305_finish_ext_neon 578 | poly1305_finish_ext_neon_local: 579 | stmfd sp!, {r4-r11, lr} 580 | sub sp, sp, #32 581 | mov r5, r0 582 | mov r6, r1 583 | mov r7, r2 584 | mov r8, r3 585 | ands r7, r7, r7 586 | beq poly1305_finish_ext_neon_noremaining 587 | mov r9, sp 588 | veor q0, q0, q0 589 | veor q1, q1, q1 590 | vst1.64 {q0-q1}, [sp] 591 | tst r7, #16 592 | beq poly1305_finish_ext_neon_skip16 593 | vld1.u64 {q0}, [r1]! 594 | vst1.64 {q0}, [r9]! 595 | poly1305_finish_ext_neon_skip16: 596 | tst r7, #8 597 | beq poly1305_finish_ext_neon_skip8 598 | ldmia r1!, {r10-r11} 599 | stmia r9!, {r10-r11} 600 | poly1305_finish_ext_neon_skip8: 601 | tst r7, #4 602 | beq poly1305_finish_ext_neon_skip4 603 | ldr r10, [r1], #4 604 | str r10, [r9], #4 605 | poly1305_finish_ext_neon_skip4: 606 | tst r7, #2 607 | beq poly1305_finish_ext_neon_skip2 608 | ldrh r10, [r1], #2 609 | strh r10, [r9], #2 610 | poly1305_finish_ext_neon_skip2: 611 | tst r7, #1 612 | beq poly1305_finish_ext_neon_skip1 613 | ldrb r10, [r1], #1 614 | strb r10, [r9], #1 615 | poly1305_finish_ext_neon_skip1: 616 | cmp r7, #16 617 | beq poly1305_finish_ext_neon_skipfinalbit 618 | mov r10, #1 619 | strb r10, [r9] 620 | poly1305_finish_ext_neon_skipfinalbit: 621 | ldr r10, [r5, #116] 622 | orrhs r10, #2 623 | orrlo r10, #4 624 | str r10, [r5, #116] 625 | mov r0, r5 626 | mov r1, sp 627 | mov r2, #32 628 | bl poly1305_blocks_neon_local 629 | poly1305_finish_ext_neon_noremaining: 630 | ldr r10, [r5, #116] 631 | tst r10, #1 632 | beq 
poly1305_finish_ext_neon_notstarted 633 | cmp r7, #0 634 | beq poly1305_finish_ext_neon_user2r 635 | cmp r7, #16 636 | bls poly1305_finish_ext_neon_user1 637 | poly1305_finish_ext_neon_user2r: 638 | orr r10, r10, #8 639 | b poly1305_finish_ext_neon_finalblock 640 | poly1305_finish_ext_neon_user1: 641 | orr r10, r10, #16 642 | poly1305_finish_ext_neon_finalblock: 643 | str r10, [r5, #116] 644 | mov r0, r5 645 | eor r1, r1, r1 646 | mov r2, #32 647 | bl poly1305_blocks_neon_local 648 | poly1305_finish_ext_neon_notstarted: 649 | add r0, r5, #60 650 | add r9, r5, #100 651 | ldm r0, {r0-r3} 652 | ldm r9, {r9-r12} 653 | adds r0, r0, r9 654 | adcs r1, r1, r10 655 | adcs r2, r2, r11 656 | adcs r3, r3, r12 657 | stm r8, {r0-r3} 658 | veor q0, q0, q0 659 | veor q1, q1, q1 660 | veor q2, q2, q2 661 | veor q3, q3, q3 662 | vstmia r5!, {q0-q3} 663 | vstm r5, {q0-q3} 664 | add sp, sp, #32 665 | ldmfd sp!, {r4-r11, lr} 666 | bx lr 667 | FN_END poly1305_finish_ext_neon 668 | 669 | GLOBAL_HIDDEN_FN poly1305_auth_neon 670 | cmp r2, #128 671 | blo poly1305_auth_armv6_local 672 | stmfd sp!, {r4-r8, lr} 673 | mov r8, sp 674 | and sp, sp, #(~63) 675 | sub sp, sp, #128 676 | mov r4, r0 677 | mov r5, r1 678 | mov r6, r2 679 | mov r7, r3 680 | mov r0, sp 681 | mov r1, r7 682 | bl poly1305_init_ext_neon_local 683 | ands r2, r6, #(~31) 684 | beq poly1305_auth_neon_noblocks 685 | mov r0, sp 686 | mov r1, r5 687 | add r5, r5, r2 688 | sub r6, r6, r2 689 | bl poly1305_blocks_neon_local 690 | poly1305_auth_neon_noblocks: 691 | mov r0, sp 692 | mov r1, r5 693 | mov r2, r6 694 | mov r3, r4 695 | bl poly1305_finish_ext_neon_local 696 | mov sp, r8 697 | ldmfd sp!, {r4-r8, lr} 698 | bx lr 699 | FN_END poly1305_auth_neon 700 | -------------------------------------------------------------------------------- /app/extensions/poly1305/poly1305_ref-32.inc: -------------------------------------------------------------------------------- 1 | /* 2 | poly1305 implementation using 32 bit * 32 bit = 64 bit 
multiplication and 64 bit addition 3 | 4 | assumes the existence of uint32_t and uint64_t 5 | */ 6 | 7 | enum { 8 | POLY1305_BLOCK_SIZE = 16 9 | }; 10 | 11 | typedef struct poly1305_state_ref_t { 12 | uint32_t r[5]; 13 | uint32_t h[5]; 14 | uint32_t pad[4]; 15 | unsigned char final; 16 | } poly1305_state_ref_t; 17 | 18 | /* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */ 19 | static uint32_t 20 | U8TO32(const unsigned char *p) { 21 | return 22 | (((uint32_t)(p[0] & 0xff) ) | 23 | ((uint32_t)(p[1] & 0xff) << 8) | 24 | ((uint32_t)(p[2] & 0xff) << 16) | 25 | ((uint32_t)(p[3] & 0xff) << 24)); 26 | } 27 | 28 | /* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */ 29 | static void 30 | U32TO8(unsigned char *p, uint32_t v) { 31 | p[0] = (unsigned char)((v ) & 0xff); 32 | p[1] = (unsigned char)((v >> 8) & 0xff); 33 | p[2] = (unsigned char)((v >> 16) & 0xff); 34 | p[3] = (unsigned char)((v >> 24) & 0xff); 35 | } 36 | 37 | static size_t 38 | poly1305_block_size_ref(void) { 39 | return POLY1305_BLOCK_SIZE; 40 | } 41 | 42 | static void 43 | poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { 44 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 45 | 46 | /* bytes_hint not used */ 47 | (void)bytes_hint; 48 | 49 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 50 | st->r[0] = (U8TO32(&key->b[ 0]) ) & 0x3ffffff; 51 | st->r[1] = (U8TO32(&key->b[ 3]) >> 2) & 0x3ffff03; 52 | st->r[2] = (U8TO32(&key->b[ 6]) >> 4) & 0x3ffc0ff; 53 | st->r[3] = (U8TO32(&key->b[ 9]) >> 6) & 0x3f03fff; 54 | st->r[4] = (U8TO32(&key->b[12]) >> 8) & 0x00fffff; 55 | 56 | /* h = 0 */ 57 | st->h[0] = 0; 58 | st->h[1] = 0; 59 | st->h[2] = 0; 60 | st->h[3] = 0; 61 | st->h[4] = 0; 62 | 63 | /* save pad for later */ 64 | st->pad[0] = U8TO32(&key->b[16]); 65 | st->pad[1] = U8TO32(&key->b[20]); 66 | st->pad[2] = U8TO32(&key->b[24]); 67 | st->pad[3] = U8TO32(&key->b[28]); 68 | 69 | st->final = 0; 70 | } 71 | 72 
| static void 73 | poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { 74 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 75 | const uint32_t hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ 76 | uint32_t r0,r1,r2,r3,r4; 77 | uint32_t s1,s2,s3,s4; 78 | uint32_t h0,h1,h2,h3,h4; 79 | uint64_t d0,d1,d2,d3,d4; 80 | uint32_t c; 81 | 82 | r0 = st->r[0]; 83 | r1 = st->r[1]; 84 | r2 = st->r[2]; 85 | r3 = st->r[3]; 86 | r4 = st->r[4]; 87 | 88 | s1 = r1 * 5; 89 | s2 = r2 * 5; 90 | s3 = r3 * 5; 91 | s4 = r4 * 5; 92 | 93 | h0 = st->h[0]; 94 | h1 = st->h[1]; 95 | h2 = st->h[2]; 96 | h3 = st->h[3]; 97 | h4 = st->h[4]; 98 | 99 | while (inlen >= POLY1305_BLOCK_SIZE) { 100 | /* h += m[i] */ 101 | h0 += (U8TO32(in+ 0) ) & 0x3ffffff; 102 | h1 += (U8TO32(in+ 3) >> 2) & 0x3ffffff; 103 | h2 += (U8TO32(in+ 6) >> 4) & 0x3ffffff; 104 | h3 += (U8TO32(in+ 9) >> 6) & 0x3ffffff; 105 | h4 += (U8TO32(in+12) >> 8) | hibit; 106 | 107 | /* h *= r */ 108 | d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1); 109 | d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2); 110 | d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3); 111 | d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4); 112 | d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0); 113 | 114 | /* (partial) h %= p */ 115 | c = (uint32_t)(d0 >> 26); h0 = (uint32_t)d0 & 0x3ffffff; 116 | d1 += c; c = (uint32_t)(d1 >> 26); h1 = (uint32_t)d1 & 0x3ffffff; 117 | d2 += c; c = (uint32_t)(d2 >> 26); h2 = (uint32_t)d2 & 0x3ffffff; 118 | d3 += c; c = (uint32_t)(d3 >> 26); h3 = (uint32_t)d3 & 0x3ffffff; 119 | d4 += c; c = (uint32_t)(d4 >> 26); h4 = (uint32_t)d4 & 0x3ffffff; 120 | h0 += c * 5; c = (h0 >> 26); h0 = h0 & 
0x3ffffff; 121 | h1 += c; 122 | 123 | in += POLY1305_BLOCK_SIZE; 124 | inlen -= POLY1305_BLOCK_SIZE; 125 | } 126 | 127 | st->h[0] = h0; 128 | st->h[1] = h1; 129 | st->h[2] = h2; 130 | st->h[3] = h3; 131 | st->h[4] = h4; 132 | } 133 | 134 | static void 135 | poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { 136 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 137 | uint32_t h0,h1,h2,h3,h4,c; 138 | uint32_t g0,g1,g2,g3,g4; 139 | uint64_t f; 140 | uint32_t mask; 141 | 142 | /* process the remaining block */ 143 | if (remaining) { 144 | unsigned char final[POLY1305_BLOCK_SIZE] = {0}; 145 | size_t i; 146 | for (i = 0; i < remaining; i++) 147 | final[i] = in[i]; 148 | final[remaining] = 1; 149 | st->final = 1; 150 | poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); 151 | } 152 | 153 | /* fully carry h */ 154 | h0 = st->h[0]; 155 | h1 = st->h[1]; 156 | h2 = st->h[2]; 157 | h3 = st->h[3]; 158 | h4 = st->h[4]; 159 | 160 | c = h1 >> 26; h1 = h1 & 0x3ffffff; 161 | h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; 162 | h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; 163 | h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; 164 | h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; 165 | h1 += c; 166 | 167 | /* compute h + -p */ 168 | g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; 169 | g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; 170 | g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; 171 | g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; 172 | g4 = h4 + c - (1 << 26); 173 | 174 | /* select h if h < p, or h + -p if h >= p */ 175 | mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; 176 | g0 &= mask; 177 | g1 &= mask; 178 | g2 &= mask; 179 | g3 &= mask; 180 | g4 &= mask; 181 | mask = ~mask; 182 | h0 = (h0 & mask) | g0; 183 | h1 = (h1 & mask) | g1; 184 | h2 = (h2 & mask) | g2; 185 | h3 = (h3 & mask) | g3; 186 | h4 = (h4 & mask) | g4; 187 | 188 | /* h = h % (2^128) */ 189 | h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; 190 | h1 = ((h1 >> 6) | (h2 << 20)) & 
0xffffffff; 191 | h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; 192 | h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; 193 | 194 | /* mac = (h + pad) % (2^128) */ 195 | f = (uint64_t)h0 + st->pad[0] ; h0 = (uint32_t)f; 196 | f = (uint64_t)h1 + st->pad[1] + (f >> 32); h1 = (uint32_t)f; 197 | f = (uint64_t)h2 + st->pad[2] + (f >> 32); h2 = (uint32_t)f; 198 | f = (uint64_t)h3 + st->pad[3] + (f >> 32); h3 = (uint32_t)f; 199 | 200 | U32TO8(mac + 0, h0); 201 | U32TO8(mac + 4, h1); 202 | U32TO8(mac + 8, h2); 203 | U32TO8(mac + 12, h3); 204 | 205 | /* zero out the state */ 206 | st->h[0] = 0; 207 | st->h[1] = 0; 208 | st->h[2] = 0; 209 | st->h[3] = 0; 210 | st->h[4] = 0; 211 | st->r[0] = 0; 212 | st->r[1] = 0; 213 | st->r[2] = 0; 214 | st->r[3] = 0; 215 | st->r[4] = 0; 216 | st->pad[0] = 0; 217 | st->pad[1] = 0; 218 | st->pad[2] = 0; 219 | st->pad[3] = 0; 220 | } 221 | 222 | static void 223 | poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { 224 | poly1305_state_ref_t st; 225 | size_t blocks; 226 | poly1305_init_ext_ref(&st, key, inlen); 227 | blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); 228 | if (blocks) { 229 | poly1305_blocks_ref(&st, in, blocks); 230 | in += blocks; 231 | inlen -= blocks; 232 | } 233 | poly1305_finish_ext_ref(&st, in, inlen, mac); 234 | } 235 | 236 | -------------------------------------------------------------------------------- /app/extensions/poly1305/poly1305_ref-64.inc: -------------------------------------------------------------------------------- 1 | /* 2 | poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition 3 | 4 | assumes the existence of uint64_t and uint128_t 5 | */ 6 | 7 | enum { 8 | POLY1305_BLOCK_SIZE = 16 9 | }; 10 | 11 | typedef struct poly1305_state_ref_t { 12 | uint64_t r[3]; 13 | uint64_t h[3]; 14 | uint64_t pad[2]; 15 | unsigned char final; 16 | } poly1305_state_ref_t; 17 | 18 | /* interpret eight 8 bit unsigned integers as a 64 bit 
unsigned integer in little endian */ 19 | static uint64_t 20 | U8TO64(const unsigned char *p) { 21 | return 22 | ((uint64_t)p[0] ) | 23 | ((uint64_t)p[1] << 8) | 24 | ((uint64_t)p[2] << 16) | 25 | ((uint64_t)p[3] << 24) | 26 | ((uint64_t)p[4] << 32) | 27 | ((uint64_t)p[5] << 40) | 28 | ((uint64_t)p[6] << 48) | 29 | ((uint64_t)p[7] << 56); 30 | } 31 | 32 | /* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */ 33 | static void 34 | U64TO8(unsigned char *p, uint64_t v) { 35 | p[0] = (unsigned char)(v ) & 0xff; 36 | p[1] = (unsigned char)(v >> 8) & 0xff; 37 | p[2] = (unsigned char)(v >> 16) & 0xff; 38 | p[3] = (unsigned char)(v >> 24) & 0xff; 39 | p[4] = (unsigned char)(v >> 32) & 0xff; 40 | p[5] = (unsigned char)(v >> 40) & 0xff; 41 | p[6] = (unsigned char)(v >> 48) & 0xff; 42 | p[7] = (unsigned char)(v >> 56) & 0xff; 43 | } 44 | 45 | static size_t 46 | poly1305_block_size_ref(void) { 47 | return POLY1305_BLOCK_SIZE; 48 | } 49 | 50 | static void 51 | poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { 52 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 53 | uint64_t t0, t1; 54 | 55 | /* bytes_hint not used */ 56 | (void)bytes_hint; 57 | 58 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 59 | t0 = U8TO64(&key->b[0]); 60 | t1 = U8TO64(&key->b[8]); 61 | st->r[0] = ( t0 ) & 0xffc0fffffff; 62 | st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; 63 | st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; 64 | 65 | /* h = 0 */ 66 | st->h[0] = 0; 67 | st->h[1] = 0; 68 | st->h[2] = 0; 69 | 70 | /* save pad for later */ 71 | st->pad[0] = U8TO64(&key->b[16]); 72 | st->pad[1] = U8TO64(&key->b[24]); 73 | 74 | st->final = 0; 75 | } 76 | 77 | static void 78 | poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { 79 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 80 | const uint64_t hibit = (st->final) ? 
0 : ((uint64_t)1 << 40); /* 1 << 128 */ 81 | uint64_t r0,r1,r2; 82 | uint64_t s1,s2; 83 | uint64_t h0,h1,h2; 84 | uint64_t c; 85 | uint128_t d0,d1,d2; 86 | 87 | r0 = st->r[0]; 88 | r1 = st->r[1]; 89 | r2 = st->r[2]; 90 | 91 | s1 = r1 * (5 << 2); 92 | s2 = r2 * (5 << 2); 93 | 94 | h0 = st->h[0]; 95 | h1 = st->h[1]; 96 | h2 = st->h[2]; 97 | 98 | while (inlen >= POLY1305_BLOCK_SIZE) { 99 | uint64_t t0, t1; 100 | 101 | /* h += in[i] */ 102 | t0 = U8TO64(in + 0); 103 | t1 = U8TO64(in + 8); 104 | h0 += (( t0 ) & 0xfffffffffff); 105 | h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); 106 | h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; 107 | 108 | /* h *= r */ 109 | d0 = ((uint128_t)h0 * r0) + ((uint128_t)h1 * s2) + ((uint128_t)h2 * s1); 110 | d1 = ((uint128_t)h0 * r1) + ((uint128_t)h1 * r0) + ((uint128_t)h2 * s2); 111 | d2 = ((uint128_t)h0 * r2) + ((uint128_t)h1 * r1) + ((uint128_t)h2 * r0); 112 | 113 | /* (partial) h %= p */ 114 | c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff; 115 | d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff; 116 | d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff; 117 | h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; 118 | h1 += c; 119 | 120 | in += POLY1305_BLOCK_SIZE; 121 | inlen -= POLY1305_BLOCK_SIZE; 122 | } 123 | 124 | st->h[0] = h0; 125 | st->h[1] = h1; 126 | st->h[2] = h2; 127 | } 128 | 129 | static void 130 | poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { 131 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 132 | uint64_t h0, h1, h2, c; 133 | uint64_t g0, g1, g2; 134 | uint64_t t0, t1; 135 | 136 | /* process the remaining block */ 137 | if (remaining) { 138 | unsigned char final[POLY1305_BLOCK_SIZE] = {0}; 139 | size_t i; 140 | for (i = 0; i < remaining; i++) 141 | final[i] = in[i]; 142 | final[remaining] = 1; 143 | st->final = 1; 144 | poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); 145 | } 146 | 147 | /* 
fully carry h */ 148 | h0 = st->h[0]; 149 | h1 = st->h[1]; 150 | h2 = st->h[2]; 151 | 152 | c = (h1 >> 44); h1 &= 0xfffffffffff; 153 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; 154 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; 155 | h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; 156 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; 157 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; 158 | h1 += c; 159 | 160 | /* compute h + -p */ 161 | g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; 162 | g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; 163 | g2 = h2 + c - ((uint64_t)1 << 42); 164 | 165 | /* select h if h < p, or h + -p if h >= p */ 166 | c = (g2 >> 63) - 1; 167 | h0 = (h0 & ~c) | (g0 & c); 168 | h1 = (h1 & ~c) | (g1 & c); 169 | h2 = (h2 & ~c) | (g2 & c); 170 | 171 | /* h = (h + pad) */ 172 | t0 = st->pad[0]; 173 | t1 = st->pad[1]; 174 | 175 | h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; 176 | h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; 177 | h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff; 178 | 179 | /* mac = h % (2^128) */ 180 | h0 = ((h0 ) | (h1 << 44)); 181 | h1 = ((h1 >> 20) | (h2 << 24)); 182 | 183 | U64TO8(&mac[0], h0); 184 | U64TO8(&mac[8], h1); 185 | 186 | /* zero out the state */ 187 | st->h[0] = 0; 188 | st->h[1] = 0; 189 | st->h[2] = 0; 190 | st->r[0] = 0; 191 | st->r[1] = 0; 192 | st->r[2] = 0; 193 | st->pad[0] = 0; 194 | st->pad[1] = 0; 195 | } 196 | 197 | 198 | static void 199 | poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { 200 | poly1305_state_ref_t st; 201 | size_t blocks; 202 | poly1305_init_ext_ref(&st, key, inlen); 203 | blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); 204 | if (blocks) { 205 | poly1305_blocks_ref(&st, in, blocks); 206 | in += blocks; 207 | inlen -= blocks; 208 | } 209 | poly1305_finish_ext_ref(&st, in, inlen, mac); 210 | } 211 | 212 | 
-------------------------------------------------------------------------------- /app/extensions/poly1305/poly1305_ref-8.inc: -------------------------------------------------------------------------------- 1 | /* 2 | poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition 3 | 4 | based on the public domain reference version in supercop by djb 5 | */ 6 | 7 | enum { 8 | POLY1305_BLOCK_SIZE = 16 9 | }; 10 | 11 | typedef struct poly1305_state_ref_t { 12 | unsigned char r[17]; 13 | unsigned char h[17]; 14 | unsigned char pad[17]; 15 | unsigned char final; 16 | } poly1305_state_ref_t; 17 | 18 | static size_t 19 | poly1305_block_size_ref(void) { 20 | return POLY1305_BLOCK_SIZE; 21 | } 22 | 23 | static void 24 | poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { 25 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 26 | size_t i; 27 | 28 | /* bytes_hint not used */ 29 | (void)bytes_hint; 30 | 31 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 32 | for (i = 0; i < 16; i++) st->r[i] = key->b[i]; 33 | st->r[3] &= 0x0f; 34 | st->r[4] &= 0xfc; 35 | st->r[7] &= 0x0f; 36 | st->r[8] &= 0xfc; 37 | st->r[11] &= 0x0f; 38 | st->r[12] &= 0xfc; 39 | st->r[15] &= 0x0f; 40 | st->r[16] = 0; 41 | 42 | /* h = 0 */ 43 | for (i = 0; i < 17; i++) st->h[i] = 0; 44 | 45 | /* save pad for later */ 46 | for (i = 0; i < 16; i++) st->pad[i] = key->b[i + 16]; 47 | st->pad[16] = 0; 48 | 49 | st->final = 0; 50 | } 51 | 52 | static void 53 | poly1305_add(unsigned char h[17], const unsigned char c[17]) { 54 | unsigned short u = 0; 55 | size_t i; 56 | for (i = 0; i < 17; i++) { 57 | u += (unsigned short)h[i] + (unsigned short)c[i]; 58 | h[i] = (unsigned char)u & 0xff; 59 | u >>= 8; 60 | } 61 | } 62 | 63 | static void 64 | poly1305_partial_reduce(unsigned char h[17], unsigned long hr[17]) { 65 | unsigned long u; 66 | size_t i; 67 | u = 0; 68 | for (i = 0; i < 16; i++) { 69 | u += hr[i]; 70 | h[i] = (unsigned char)u & 0xff; 71 | u >>= 8; 
72 | } 73 | u += hr[16]; 74 | h[16] = (unsigned char)u & 0x03; 75 | u >>= 2; 76 | u += (u << 2); /* u *= 5; */ 77 | for (i = 0; i < 16; i++) { 78 | u += h[i]; 79 | h[i] = (unsigned char)u & 0xff; 80 | u >>= 8; 81 | } 82 | h[16] += (unsigned char)u; 83 | } 84 | 85 | static void 86 | poly1305_full_reduce(unsigned char h[17]) { 87 | static const unsigned char minusp[17] = { 88 | 0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 89 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 90 | 0xfc 91 | }; 92 | unsigned char horig[17], negative; 93 | size_t i; 94 | 95 | /* compute h + -p */ 96 | for (i = 0; i < 17; i++) horig[i] = h[i]; 97 | poly1305_add(h, minusp); 98 | 99 | /* select h if h < p, or h + -p if h >= p */ 100 | negative = -(h[16] >> 7); 101 | for (i = 0; i < 17; i++) 102 | h[i] ^= negative & (horig[i] ^ h[i]); 103 | } 104 | 105 | 106 | static void 107 | poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { 108 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 109 | const unsigned char hibit = st->final ? 
0 : 1; /* 1 << 128 */ 110 | 111 | while (inlen >= POLY1305_BLOCK_SIZE) { 112 | unsigned long hr[17], u; 113 | unsigned char c[17]; 114 | size_t i, j; 115 | 116 | /* h += m */ 117 | for (i = 0; i < 16; i++) 118 | c[i] = in[i]; 119 | c[16] = hibit; 120 | poly1305_add(st->h, c); 121 | 122 | /* h *= r */ 123 | for (i = 0; i < 17; i++) { 124 | u = 0; 125 | for (j = 0; j <= i ; j++) { 126 | u += (unsigned short)st->h[j] * st->r[i - j]; 127 | } 128 | for (j = i + 1; j < 17; j++) { 129 | unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j]; 130 | v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */ 131 | u += v; 132 | } 133 | hr[i] = u; 134 | } 135 | 136 | /* (partial) h %= p */ 137 | poly1305_partial_reduce(st->h, hr); 138 | 139 | in += POLY1305_BLOCK_SIZE; 140 | inlen -= POLY1305_BLOCK_SIZE; 141 | } 142 | } 143 | 144 | static void 145 | poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { 146 | poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; 147 | size_t i; 148 | 149 | /* process the remaining block */ 150 | if (remaining) { 151 | unsigned char final[POLY1305_BLOCK_SIZE] = {0}; 152 | size_t i; 153 | for (i = 0; i < remaining; i++) 154 | final[i] = in[i]; 155 | final[remaining] = 1; 156 | st->final = 1; 157 | poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); 158 | } 159 | 160 | /* fully reduce h */ 161 | poly1305_full_reduce(st->h); 162 | 163 | /* h = (h + pad) % (1 << 128) */ 164 | poly1305_add(st->h, st->pad); 165 | for (i = 0; i < 16; i++) mac[i] = st->h[i]; 166 | 167 | /* zero out the state */ 168 | for (i = 0; i < 17; i++) st->r[i] = 0; 169 | for (i = 0; i < 17; i++) st->h[i] = 0; 170 | for (i = 0; i < 17; i++) st->pad[i] = 0; 171 | } 172 | 173 | static void 174 | poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { 175 | poly1305_state_ref_t st; 176 | size_t blocks; 177 | poly1305_init_ext_ref(&st, key, inlen); 178 | blocks = (inlen & 
/* cannibalized from the public domain x86 implementation in supercop by djb */

SECTION_TEXT

/* size_t poly1305_block_size_x86(void): returns 16 in %eax */
GLOBAL_HIDDEN_FN poly1305_block_size_x86
	movl $16, %eax
	ret
FN_END poly1305_block_size_x86

/*
	poly1305_init_ext_x86(state, key, bytes_hint), arguments on the stack

	state layout written here (byte offsets from state):
	    0.. 55  r0, r1, r1*c, r2, r2*c, r3, r3*c as doubles, c = constant at table offset 0
	            (presumably the 5 * 2^-130 wrap-around factor; table not visible here, TODO confirm)
	   56..103  h, zeroed (read back by the other routines as four 12 byte slots)
	  104..119  pad = key bytes 16..31, copied verbatim

	The third stack argument (bytes_hint) is never read.
*/
GLOBAL_HIDDEN_FN poly1305_init_ext_x86
poly1305_init_ext_x86_local:
	pushl %ebx
	pushl %esi
	pushl %edi
	/* two scratch dwords: 0(%esp) = control word to load, 4(%esp) = caller's control word */
	pushl %eax
	pushl %eax
	/* 5 pushes + return address = 24 bytes, so the first argument is at 24(%esp) */
	movl 24(%esp), %eax
	movl 28(%esp), %edx
	/* run with x87 control word 0x137f; the caller's word is saved and restored before ret */
	movl $0x137f, %ecx
	movl %ecx, 0(%esp)
	fstcw 4(%esp)
	fldcw 0(%esp)
	/* pad = key[16..31] */
	movl 16(%edx), %ecx
	movl 20(%edx), %ebx
	movl 24(%edx), %esi
	movl 28(%edx), %edi
	movl %ecx, 104(%eax)
	movl %ebx, 108(%eax)
	movl %esi, 112(%eax)
	movl %edi, 116(%eax)
	/* r = key[0..15] with the poly1305 clamp applied per 32 bit word */
	movl 0(%edx), %ecx
	movl 4(%edx), %ebx
	movl 8(%edx), %esi
	movl 12(%edx), %edi
	andl $0x0fffffff, %ecx
	andl $0x0ffffffc, %ebx
	andl $0x0ffffffc, %esi
	andl $0x0ffffffc, %edi
	/*
		integer to double conversion: each word becomes the low half of a double whose
		high half is the exponent pattern for 2^52, 2^84, 2^116, 2^148 respectively
	*/
	movl %ecx, 0(%eax)
	movl $0x43300000, 4(%eax)
	movl %ebx, 8(%eax)
	movl $0x45300000, 12(%eax)
	movl %esi, 16(%eax)
	movl $0x47300000, 20(%eax)
	movl %edi, 24(%eax)
	movl $0x49300000, 28(%eax)
	LOAD_VAR_PIC poly1305_constants_x86, %edx
	/* subtract the table entries at 64..88 (presumably those same powers of two, leaving r0..r3 at their positional weights) */
	fldl 0(%eax)
	fsubl 64(%edx)
	fldl 8(%eax)
	fsubl 72(%edx)
	fldl 16(%eax)
	fsubl 80(%edx)
	fldl 24(%eax)
	fsubl 88(%edx)
	/* x87 stack is now r3, r2, r1, r0 (top first); store r0, then each of r1..r3 plus its scaled copy */
	fxch %st(3)
	fstpl 0(%eax)
	fxch %st(1)
	fstl 8(%eax)
	fmull 0(%edx)
	fstpl 16(%eax)
	fstl 24(%eax)
	fmull 0(%edx)
	fstpl 32(%eax)
	fstl 40(%eax)
	fmull 0(%edx)
	fstpl 48(%eax)
	/* h = 0: six 8 byte zero stores cover state bytes 56..103 */
	fldz
	fstl 56(%eax)
	fstl 64(%eax)
	fstl 72(%eax)
	fstl 80(%eax)
	fstl 88(%eax)
	fstpl 96(%eax)
	/* restore the caller's x87 control word and registers */
	fldcw 4(%esp)
	popl %eax
	popl %eax
	popl %edi
	popl %esi
	popl %ebx
	ret
FN_END poly1305_init_ext_x86
48(%eax) 68 | fldz 69 | fstl 56(%eax) 70 | fstl 64(%eax) 71 | fstl 72(%eax) 72 | fstl 80(%eax) 73 | fstl 88(%eax) 74 | fstpl 96(%eax) 75 | fldcw 4(%esp) 76 | popl %eax 77 | popl %eax 78 | popl %edi 79 | popl %esi 80 | popl %ebx 81 | ret 82 | FN_END poly1305_init_ext_x86 83 | 84 | 85 | GLOBAL_HIDDEN_FN poly1305_blocks_x86 86 | poly1305_blocks_x86_local: 87 | movl %esp,%eax 88 | andl $63,%eax 89 | addl $192,%eax 90 | subl %eax,%esp 91 | movl %eax,0(%esp) 92 | movl %ebx,4(%esp) 93 | movl %esi,8(%esp) 94 | movl %edi,12(%esp) 95 | movl %ebp,16(%esp) 96 | movl $0x137f, %ecx 97 | movl %ecx, 188(%esp) 98 | fstcw 184(%esp) 99 | fldcw 188(%esp) 100 | movl $0x43300000,100(%esp) 101 | movl $0x45300000,108(%esp) 102 | movl $0x47300000,116(%esp) 103 | movl $0x49300000,124(%esp) 104 | movl 4(%esp,%eax),%ebp 105 | movl 8(%esp,%eax),%esi 106 | movl 12(%esp,%eax),%ecx 107 | LOAD_VAR_PIC poly1305_constants_x86, %edx 108 | cmp $16,%ecx 109 | jb poly1305_blocks_x86_nomorebytes 110 | fldt 92(%ebp) 111 | fldt 80(%ebp) 112 | fldt 68(%ebp) 113 | fldt 56(%ebp) 114 | add $16,%esi 115 | sub $16,%ecx 116 | movl %ecx, 20(%esp) 117 | movl -4(%esi),%eax 118 | movl -8(%esi),%ecx 119 | movl -12(%esi),%ebx 120 | movl -16(%esi),%edi 121 | movl %eax,120(%esp) 122 | movl %ecx,112(%esp) 123 | movl %ebx,104(%esp) 124 | movl %edi,96(%esp) 125 | fxch %st(3) 126 | faddl 120(%esp) 127 | fsubl 96(%edx) 128 | fxch %st(1) 129 | faddl 104(%esp) 130 | fsubl 72(%edx) 131 | fxch %st(2) 132 | faddl 112(%esp) 133 | fsubl 80(%edx) 134 | fxch %st(3) 135 | faddl 96(%esp) 136 | fsubl 64(%edx) 137 | movl 20(%esp), %ecx 138 | cmp $16, %ecx 139 | jb poly1305_blocks_x86_lastmultiply 140 | poly1305_blocks_x86_multiplyaddatleast16bytes: 141 | add $16,%esi 142 | sub $16,%ecx 143 | movl %ecx, 20(%esp) 144 | movl -4(%esi),%eax 145 | movl -8(%esi),%ecx 146 | movl -12(%esi),%ebx 147 | movl -16(%esi),%edi 148 | movl %eax,120(%esp) 149 | movl %ecx,112(%esp) 150 | movl %ebx,104(%esp) 151 | movl %edi,96(%esp) 152 | fldl 56(%edx) 153 | 
fadd %st(2),%st(0) 154 | fsubl 56(%edx) 155 | fsubr %st(0),%st(2) 156 | fmull 0(%edx) 157 | fldl 32(%edx) 158 | fadd %st(2),%st(0) 159 | fsubl 32(%edx) 160 | fsubr %st(0),%st(2) 161 | fxch %st(2) 162 | faddp %st(0),%st(1) 163 | fldl 40(%edx) 164 | fadd %st(4),%st(0) 165 | fsubl 40(%edx) 166 | fsubr %st(0),%st(4) 167 | fldl 48(%edx) 168 | fadd %st(6),%st(0) 169 | fsubl 48(%edx) 170 | fsubr %st(0),%st(6) 171 | fxch %st(6) 172 | faddp %st(0),%st(1) 173 | fxch %st(3) 174 | faddp %st(0),%st(5) 175 | fxch %st(3) 176 | faddp %st(0),%st(1) 177 | fldl 40(%ebp) 178 | fmul %st(3),%st(0) 179 | fldl 24(%ebp) 180 | fmul %st(4),%st(0) 181 | fldl 8(%ebp) 182 | fmul %st(5),%st(0) 183 | fldl 0(%ebp) 184 | fmulp %st(0),%st(6) 185 | fldl 24(%ebp) 186 | fmul %st(4),%st(0) 187 | faddp %st(0),%st(3) 188 | fldl 8(%ebp) 189 | fmul %st(4),%st(0) 190 | faddp %st(0),%st(2) 191 | fldl 0(%ebp) 192 | fmul %st(4),%st(0) 193 | faddp %st(0),%st(1) 194 | fldl 48(%ebp) 195 | fmulp %st(0),%st(4) 196 | fxch %st(3) 197 | faddp %st(0),%st(5) 198 | fldl 8(%ebp) 199 | fmul %st(4),%st(0) 200 | faddp %st(0),%st(2) 201 | fldl 0(%ebp) 202 | fmul %st(4),%st(0) 203 | faddp %st(0),%st(1) 204 | fldl 48(%ebp) 205 | fmul %st(4),%st(0) 206 | faddp %st(0),%st(3) 207 | fldl 32(%ebp) 208 | fmulp %st(0),%st(4) 209 | fxch %st(3) 210 | faddp %st(0),%st(4) 211 | fldl 0(%ebp) 212 | fmul %st(5),%st(0) 213 | faddp %st(0),%st(1) 214 | fxch %st(3) 215 | fldl 48(%ebp) 216 | fmul %st(5),%st(0) 217 | faddp %st(0),%st(3) 218 | fxch %st(1) 219 | fldl 32(%ebp) 220 | fmul %st(5),%st(0) 221 | faddp %st(0),%st(1) 222 | fldl 16(%ebp) 223 | fmulp %st(0),%st(5) 224 | fxch %st(4) 225 | faddp %st(0),%st(1) 226 | movl 20(%esp), %ecx 227 | fxch %st(2) 228 | fldl 120(%esp) 229 | fsubl 96(%edx) 230 | faddp %st(0),%st(1) 231 | fxch %st(1) 232 | fldl 112(%esp) 233 | fsubl 80(%edx) 234 | cmp $16,%ecx 235 | faddp %st(0),%st(1) 236 | fxch %st(3) 237 | fldl 104(%esp) 238 | fsubl 72(%edx) 239 | faddp %st(0),%st(1) 240 | fxch %st(2) 241 | fldl 96(%esp) 
242 | fsubl 64(%edx) 243 | faddp %st(0),%st(1) 244 | jae poly1305_blocks_x86_multiplyaddatleast16bytes 245 | poly1305_blocks_x86_lastmultiply: 246 | fldl 56(%edx) 247 | fadd %st(2),%st(0) 248 | fsubl 56(%edx) 249 | fsubr %st(0),%st(2) 250 | fmull 0(%edx) 251 | fldl 32(%edx) 252 | fadd %st(2),%st(0) 253 | fsubl 32(%edx) 254 | fsubr %st(0),%st(2) 255 | fldl 40(%edx) 256 | fadd %st(5),%st(0) 257 | fsubl 40(%edx) 258 | fsubr %st(0),%st(5) 259 | fldl 48(%edx) 260 | fadd %st(7),%st(0) 261 | fsubl 48(%edx) 262 | fsubr %st(0),%st(7) 263 | fxch %st(7) 264 | faddp %st(0),%st(1) 265 | fxch %st(5) 266 | faddp %st(0),%st(1) 267 | fxch %st(3) 268 | faddp %st(0),%st(5) 269 | faddp %st(0),%st(1) 270 | fldl 40(%ebp) 271 | fmul %st(1),%st(0) 272 | fldl 24(%ebp) 273 | fmul %st(2),%st(0) 274 | fldl 8(%ebp) 275 | fmul %st(3),%st(0) 276 | fldl 0(%ebp) 277 | fmulp %st(0),%st(4) 278 | fldl 24(%ebp) 279 | fmul %st(5),%st(0) 280 | faddp %st(0),%st(3) 281 | fldl 8(%ebp) 282 | fmul %st(5),%st(0) 283 | faddp %st(0),%st(2) 284 | fldl 0(%ebp) 285 | fmul %st(5),%st(0) 286 | faddp %st(0),%st(1) 287 | fldl 48(%ebp) 288 | fmulp %st(0),%st(5) 289 | fxch %st(4) 290 | faddp %st(0),%st(3) 291 | fldl 8(%ebp) 292 | fmul %st(5),%st(0) 293 | faddp %st(0),%st(2) 294 | fldl 0(%ebp) 295 | fmul %st(5),%st(0) 296 | faddp %st(0),%st(1) 297 | fldl 48(%ebp) 298 | fmul %st(5),%st(0) 299 | faddp %st(0),%st(4) 300 | fldl 32(%ebp) 301 | fmulp %st(0),%st(5) 302 | fxch %st(4) 303 | faddp %st(0),%st(2) 304 | fldl 0(%ebp) 305 | fmul %st(5),%st(0) 306 | faddp %st(0),%st(1) 307 | fldl 48(%ebp) 308 | fmul %st(5),%st(0) 309 | faddp %st(0),%st(4) 310 | fldl 32(%ebp) 311 | fmul %st(5),%st(0) 312 | faddp %st(0),%st(3) 313 | fldl 16(%ebp) 314 | fmulp %st(0),%st(5) 315 | fxch %st(4) 316 | faddp %st(0),%st(1) 317 | fstpt 56(%ebp) 318 | fstpt 68(%ebp) 319 | fstpt 80(%ebp) 320 | fstpt 92(%ebp) 321 | poly1305_blocks_x86_nomorebytes: 322 | fldcw 184(%esp) 323 | movl 0(%esp), %eax 324 | movl 4(%esp), %ebx 325 | movl 8(%esp), %esi 326 | 
movl 12(%esp), %edi 327 | movl 16(%esp), %ebp 328 | addl %eax, %esp 329 | ret 330 | FN_END poly1305_blocks_x86 331 | 332 | GLOBAL_HIDDEN_FN poly1305_finish_ext_x86 333 | poly1305_finish_ext_x86_local: 334 | pushl %ebx 335 | pushl %esi 336 | pushl %edi 337 | pushl %ebp 338 | mov %esp, %ebp 339 | andl $~63, %esp 340 | subl $256, %esp 341 | movl $0x137f, %ecx 342 | movl %ecx, 0(%esp) 343 | fstcw 4(%esp) 344 | fldcw 0(%esp) 345 | mov 20(%ebp), %ebx 346 | mov 24(%ebp), %esi 347 | mov 28(%ebp), %ecx 348 | mov 32(%ebp), %eax 349 | movl %ebp, 0(%esp) 350 | movl %ebx, 4(%esp) 351 | movl %eax, 8(%esp) 352 | fldt 92(%ebx) 353 | fldt 80(%ebx) 354 | fldt 68(%ebx) 355 | fldt 56(%ebx) 356 | andl %ecx, %ecx 357 | jz poly1305_finish_x86_nomorebytes 358 | movl $0x43300000,100(%esp) 359 | movl $0x45300000,108(%esp) 360 | movl $0x47300000,116(%esp) 361 | movl $0x49300000,124(%esp) 362 | movl $0,64(%esp) 363 | movl $0,4+64(%esp) 364 | movl $0,8+64(%esp) 365 | movl $0,12+64(%esp) 366 | leal 64(%esp),%edi 367 | rep movsb 368 | movb $1,0(%edi) 369 | movl 12+64(%esp),%eax 370 | movl 8+64(%esp),%ecx 371 | movl 4+64(%esp),%edx 372 | movl 64(%esp),%esi 373 | movl %eax,120(%esp) 374 | movl %ecx,112(%esp) 375 | movl %edx,104(%esp) 376 | movl %esi,96(%esp) 377 | LOAD_VAR_PIC poly1305_constants_x86, %edx 378 | fxch %st(3) 379 | faddl 120(%esp) 380 | fsubl 88(%edx) 381 | fxch %st(2) 382 | faddl 112(%esp) 383 | fsubl 80(%edx) 384 | fxch %st(1) 385 | faddl 104(%esp) 386 | fsubl 72(%edx) 387 | fxch %st(3) 388 | faddl 96(%esp) 389 | fsubl 64(%edx) 390 | fldl 56(%edx) 391 | fadd %st(3),%st(0) 392 | fsubl 56(%edx) 393 | fsubr %st(0),%st(3) 394 | fmull 0(%edx) 395 | fldl 32(%edx) 396 | fadd %st(2),%st(0) 397 | fsubl 32(%edx) 398 | fsubr %st(0),%st(2) 399 | fldl 40(%edx) 400 | fadd %st(6),%st(0) 401 | fsubl 40(%edx) 402 | fsubr %st(0),%st(6) 403 | fldl 48(%edx) 404 | fadd %st(5),%st(0) 405 | fsubl 48(%edx) 406 | fsubr %st(0),%st(5) 407 | fxch %st(4) 408 | faddp %st(0),%st(3) 409 | fxch %st(6) 410 | faddp 
%st(0),%st(1) 411 | fxch %st(3) 412 | faddp %st(0),%st(5) 413 | fxch %st(3) 414 | faddp %st(0),%st(1) 415 | fldl 40(%ebx) 416 | fmul %st(3),%st(0) 417 | fldl 24(%ebx) 418 | fmul %st(4),%st(0) 419 | fldl 8(%ebx) 420 | fmul %st(5),%st(0) 421 | fldl 0(%ebx) 422 | fmulp %st(0),%st(6) 423 | fldl 24(%ebx) 424 | fmul %st(5),%st(0) 425 | faddp %st(0),%st(3) 426 | fldl 8(%ebx) 427 | fmul %st(5),%st(0) 428 | faddp %st(0),%st(2) 429 | fldl 0(%ebx) 430 | fmul %st(5),%st(0) 431 | faddp %st(0),%st(1) 432 | fldl 48(%ebx) 433 | fmulp %st(0),%st(5) 434 | fxch %st(4) 435 | faddp %st(0),%st(5) 436 | fldl 8(%ebx) 437 | fmul %st(6),%st(0) 438 | faddp %st(0),%st(2) 439 | fldl 0(%ebx) 440 | fmul %st(6),%st(0) 441 | faddp %st(0),%st(1) 442 | fldl 48(%ebx) 443 | fmul %st(6),%st(0) 444 | faddp %st(0),%st(4) 445 | fldl 32(%ebx) 446 | fmulp %st(0),%st(6) 447 | fxch %st(5) 448 | faddp %st(0),%st(4) 449 | fldl 0(%ebx) 450 | fmul %st(2),%st(0) 451 | faddp %st(0),%st(1) 452 | fldl 48(%ebx) 453 | fmul %st(2),%st(0) 454 | faddp %st(0),%st(5) 455 | fldl 32(%ebx) 456 | fmul %st(2),%st(0) 457 | faddp %st(0),%st(3) 458 | fldl 16(%ebx) 459 | fmulp %st(0),%st(2) 460 | fxch %st(1) 461 | faddp %st(0),%st(3) 462 | fxch %st(3) 463 | fxch %st(2) 464 | poly1305_finish_x86_nomorebytes: 465 | LOAD_VAR_PIC poly1305_constants_x86, %edx 466 | fldl 56(%edx) 467 | fadd %st(4),%st(0) 468 | fsubl 56(%edx) 469 | fsubr %st(0),%st(4) 470 | fmull 0(%edx) 471 | fldl 32(%edx) 472 | fadd %st(2),%st(0) 473 | fsubl 32(%edx) 474 | fsubr %st(0),%st(2) 475 | fldl 40(%edx) 476 | fadd %st(4),%st(0) 477 | fsubl 40(%edx) 478 | fsubr %st(0),%st(4) 479 | fldl 48(%edx) 480 | fadd %st(6),%st(0) 481 | fsubl 48(%edx) 482 | fxch %st(6) 483 | fsub %st(6),%st(0) 484 | fxch %st(4) 485 | faddp %st(0),%st(3) 486 | fxch %st(4) 487 | faddp %st(0),%st(1) 488 | fxch %st(2) 489 | faddp %st(0),%st(3) 490 | fxch %st(4) 491 | faddp %st(0),%st(3) 492 | fxch %st(3) 493 | faddl 104(%edx) 494 | fxch %st(3) 495 | faddl 112(%edx) 496 | fxch %st(1) 497 | faddl 
120(%edx) 498 | fxch %st(2) 499 | faddl 128(%edx) 500 | fxch %st(3) 501 | fstpl 96(%esp) 502 | fstpl 104(%esp) 503 | fstpl 112(%esp) 504 | fstpl 120(%esp) 505 | movl 100(%esp),%eax 506 | and $63,%eax 507 | movl 108(%esp),%ecx 508 | and $63,%ecx 509 | movl 116(%esp),%edx 510 | and $63,%edx 511 | movl 124(%esp),%ebx 512 | and $63,%ebx 513 | movl 104(%esp),%esi 514 | addl %eax,%esi 515 | movl %esi,28(%esp) 516 | movl 112(%esp),%eax 517 | adcl %ecx,%eax 518 | movl %eax,32(%esp) 519 | movl 120(%esp),%eax 520 | adcl %edx,%eax 521 | movl %eax,36(%esp) 522 | mov $0,%eax 523 | adcl %ebx,%eax 524 | movl %eax,40(%esp) 525 | mov $5,%eax 526 | movl 96(%esp),%ecx 527 | addl %ecx,%eax 528 | movl %eax,44(%esp) 529 | mov $0,%eax 530 | movl 28(%esp),%edx 531 | adcl %edx,%eax 532 | movl %eax,28(%esp) 533 | mov $0,%eax 534 | movl 32(%esp),%ebx 535 | adcl %ebx,%eax 536 | movl %eax,32(%esp) 537 | mov $0,%eax 538 | movl 36(%esp),%esi 539 | adcl %esi,%eax 540 | movl %eax,36(%esp) 541 | mov $0xfffffffc,%eax 542 | movl 40(%esp),%edi 543 | adcl %edi,%eax 544 | sar $16,%eax 545 | mov %eax,%edi 546 | xor $0xffffffff,%edi 547 | andl %eax,%ecx 548 | movl 44(%esp),%ebp 549 | andl %edi,%ebp 550 | orl %ebp,%ecx 551 | andl %eax,%edx 552 | movl 28(%esp),%ebp 553 | andl %edi,%ebp 554 | orl %ebp,%edx 555 | andl %eax,%ebx 556 | movl 32(%esp),%ebp 557 | andl %edi,%ebp 558 | orl %ebp,%ebx 559 | andl %eax,%esi 560 | movl 36(%esp),%eax 561 | andl %edi,%eax 562 | orl %eax,%esi 563 | movl 4(%esp),%eax 564 | addl 104(%eax),%ecx 565 | adcl 108(%eax),%edx 566 | adcl 112(%eax),%ebx 567 | adcl 116(%eax),%esi 568 | movl 8(%esp),%eax 569 | movl %ecx,0(%eax) 570 | movl %edx,4(%eax) 571 | movl %ebx,8(%eax) 572 | movl %esi,12(%eax) 573 | xorl %eax, %eax 574 | movl 4(%esp),%edi 575 | fldz 576 | fstl 0(%edi) 577 | fstl 8(%edi) 578 | fstl 16(%edi) 579 | fstl 24(%edi) 580 | fstl 32(%edi) 581 | fstl 40(%edi) 582 | fstl 48(%edi) 583 | fstl 56(%edi) 584 | fstl 64(%edi) 585 | fstl 72(%edi) 586 | fstl 80(%edi) 587 | fstl 
SECTION_TEXT

/* size_t poly1305_block_size_x86(void): returns 16 in %eax */
GLOBAL_HIDDEN_FN poly1305_block_size_x86
	movl $16, %eax
	ret
FN_END poly1305_block_size_x86

/*
	poly1305_init_ext_x86(state = %rdi, key = %rsi, bytes_hint = %rdx)

	state layout written here (byte offsets from %rdi):
	    0, 8, 16   r as 44/44/42 bit limbs, clamped
	   24, 32, 40  h limbs, zeroed
	   48, 56      pad = key bytes 16..31
	   64          final flag (quadword), cleared

	%rdx is overwritten as scratch, so bytes_hint is never used.
*/
GLOBAL_HIDDEN_FN poly1305_init_ext_x86
poly1305_init_ext_x86_local:
	/* 17575274610687 = 0xffc0fffffff, clamp mask for limb 0 */
	movabsq $17575274610687, %rax
	movq (%rsi), %rcx
	movq 8(%rsi), %rdx
	/* h = 0 */
	movq $0, 24(%rdi)
	movq $0, 32(%rdi)
	movq $0, 40(%rdi)
	/* r0 = t0 & mask */
	andq %rcx, %rax
	shrq $44, %rcx
	movq %rax, (%rdi)
	/* r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff (17592181915647) */
	movq %rdx, %rax
	salq $20, %rax
	orq %rcx, %rax
	movabsq $17592181915647, %rcx
	andq %rcx, %rax
	movq %rax, 8(%rdi)
	/* r2 = (t1 >> 24) & 0x00ffffffc0f (68719475727) */
	movq %rdx, %rax
	movabsq $68719475727, %rdx
	shrq $24, %rax
	andq %rdx, %rax
	movq %rax, 16(%rdi)
	/* pad = key[16..31], final = 0 */
	movq 16(%rsi), %rax
	movq %rax, 48(%rdi)
	movq 24(%rsi), %rax
	movq $0, 64(%rdi)
	movq %rax, 56(%rdi)
	ret
FN_END poly1305_init_ext_x86
/*
	poly1305_blocks_x86(state = %rdi, in = %rsi, inlen = %rdx)

	Absorbs every whole 16 byte block: h = (h + block) * r with a partial
	reduction, using the 44/44/42 bit limb layout written by
	poly1305_init_ext_x86. Loops while more than 15 bytes remain.

	Register roles inside the loop:
	  %r14 = r0, %rbx = 2^44 - 1, %rbp = bytes left, %rsi = input cursor
	  %rdi / %r8 / %rdx = h0 / h1 / h2 carried between iterations

	Spill slots are addressed at negative offsets from %rsp without moving
	%rsp, so the routine depends on that area staying intact (the System V
	AMD64 red zone); it makes no calls.
	  -16 state pointer   -24 r1*20   -32 r2   -40 hibit   -48 r1   -56 r2*20
	  -64/-72 and -80/-88 hold the 128 bit carries (the high halves are zero)
*/
GLOBAL_HIDDEN_FN poly1305_blocks_x86
poly1305_blocks_x86_local:
	/* 1099511627776 = 2^40: the 2^128 pad bit as seen from limb 2 */
	movabsq $1099511627776, %rax
	pushq %r15
	pushq %r14
	pushq %r13
	pushq %r12
	pushq %rbp
	movq %rdx, %rbp
	pushq %rbx
	/* CF is set only when the final flag is 0, so sbb/and yields hibit = final ? 0 : 2^40 */
	cmpq $1, 64(%rdi)
	movq %rdi, -16(%rsp)
	movq (%rdi), %r14
	movq 8(%rdi), %r15
	sbbq %rcx, %rcx
	andq %rax, %rcx
	movq %rdi, %rax
	cmpq $15, %rbp
	movq %rcx, -40(%rsp)
	movq 16(%rdi), %rcx
	movq 32(%rax), %r8
	movq 24(%rdi), %rdi
	movq 40(%rax), %rdx
	movq %rcx, -32(%rsp)
	/* fewer than 16 bytes: store h back unchanged */
	jbe poly1305_blocks_x86_5
	/* s2 = r2 * 5 * 4 and s1 = r1 * 5 * 4 */
	leaq (%rcx,%rcx,4), %rax
	movq %r15, -48(%rsp)
	/* 17592186044415 = 2^44 - 1 */
	movabsq $17592186044415, %rbx
	salq $2, %rax
	movq %rax, -56(%rsp)
	leaq (%r15,%r15,4), %rax
	salq $2, %rax
	movq %rax, -24(%rsp)
	.p2align 4
poly1305_blocks_x86_6:
	/* h += block: split the 16 input bytes into 44/44/42 bit limbs and or in hibit */
	movq $0, -80(%rsp)
	movq (%rsi), %r9
	movq $0, -64(%rsp)
	movq 8(%rsi), %rcx
	movq %r9, %rax
	shrq $44, %r9
	movq %rcx, %r10
	shrq $24, %rcx
	andq %rbx, %rax
	orq -40(%rsp), %rcx
	addq %rax, %rdi
	salq $20, %r10
	movq -24(%rsp), %rax
	orq %r9, %r10
	andq %rbx, %r10
	addq %r10, %r8
	addq %rdx, %rcx
	/* d0 (%r10:%r9) = h2*s1 + h0*r0 + h1*s2 */
	mulq %rcx
	movq %rax, %r9
	movq %rdi, %rax
	movq %rdx, %r10
	mulq %r14
	addq %rax, %r9
	movq -56(%rsp), %rax
	adcq %rdx, %r10
	mulq %r8
	addq %rax, %r9
	movq -56(%rsp), %rax
	adcq %rdx, %r10
	/* %r15 = low 44 bits of d0 */
	movq %r9, %r15
	andq %rbx, %r15
	/* d1 (%r12:%r11) = h2*s2 + h0*r1 + h1*r0 */
	mulq %rcx
	movq %rax, %r11
	movq -48(%rsp), %rax
	movq %rdx, %r12
	mulq %rdi
	addq %rax, %r11
	movq %r8, %rax
	adcq %rdx, %r12
	mulq %r14
	addq %rax, %r11
	movq %rcx, %rax
	adcq %rdx, %r12
	/* d1 += d0 >> 44 */
	shrdq $44, %r10, %r9
	movq %r9, -88(%rsp)
	addq -88(%rsp), %r11
	adcq -80(%rsp), %r12
	/* d2 (%r10:%r9) = h2*r0 + h0*r2 + h1*r1; %r13 = low 44 bits of d1 */
	mulq %r14
	movq %r11, %r13
	andq %rbx, %r13
	movq %rax, %r9
	movq -32(%rsp), %rax
	movq %rdx, %r10
	mulq %rdi
	addq %rax, %r9
	movq -48(%rsp), %rax
	adcq %rdx, %r10
	mulq %r8
	addq %rax, %r9
	adcq %rdx, %r10
	/* d2 += d1 >> 44; h2 = d2 & (2^42 - 1) (4398046511103) */
	shrdq $44, %r12, %r11
	movabsq $4398046511103, %rdx
	movq %r11, -72(%rsp)
	addq -72(%rsp), %r9
	adcq -64(%rsp), %r10
	andq %r9, %rdx
	subq $16, %rbp
	addq $16, %rsi
	/* wrap the carry out of limb 2: h0 = d0.low + 5 * (d2 >> 42), then carry once into h1 */
	shrdq $42, %r10, %r9
	leaq (%r9,%r9,4), %r8
	addq %r15, %r8
	movq %r8, %rdi
	shrq $44, %r8
	andq %rbx, %rdi
	addq %r13, %r8
	cmpq $15, %rbp
	ja poly1305_blocks_x86_6
poly1305_blocks_x86_5:
	/* write h back to the state */
	movq -16(%rsp), %rcx
	movq %rdi, 24(%rcx)
	movq %r8, 32(%rcx)
	movq %rdx, 40(%rcx)
	popq %rbx
	popq %rbp
	popq %r12
	popq %r13
	popq %r14
	popq %r15
	ret
FN_END poly1305_blocks_x86
/*
	poly1305_finish_ext_x86(state = %rdi, in = %rsi, remaining = %rdx, mac = %rcx)

	Absorbs the trailing partial block (if any), fully reduces h modulo
	2^130 - 5, adds the pad and stores the 16 byte tag at mac. r, h and pad are
	zeroed in the state afterwards.

	Only bits 0..3 of remaining select bytes to copy, so the caller must pass
	fewer than 16 bytes.
*/
GLOBAL_HIDDEN_FN poly1305_finish_ext_x86
poly1305_finish_ext_x86_local:
	pushq %rbp
	movq %rsi, %rax
	movq %rcx, %rbp
	pushq %rbx
	movq %rdi, %rbx
	/* 24 bytes of scratch: the first 16 hold the padded final block */
	subq $24, %rsp
	testq %rdx, %rdx
	je poly1305_finish_ext_x86_11
	/* zero the block, then copy 8/4/2/1 byte pieces; %rdi = in - buffer so (%rcx,%rdi) reads the matching input byte */
	movq %rax, %rdi
	movq $0, (%rsp)
	movq %rsp, %rcx
	movq $0, 8(%rsp)
	subq %rsp, %rdi
	testb $8, %dl
	je poly1305_finish_ext_x86_12
	movq (%rax), %rax
	leaq 8(%rsp), %rcx
	movq %rax, (%rsp)
poly1305_finish_ext_x86_12:
	testb $4, %dl
	je poly1305_finish_ext_x86_13
	movl (%rcx,%rdi), %eax
	movl %eax, (%rcx)
	addq $4, %rcx
poly1305_finish_ext_x86_13:
	testb $2, %dl
	je poly1305_finish_ext_x86_14
	movzwl (%rcx,%rdi), %eax
	movw %ax, (%rcx)
	addq $2, %rcx
poly1305_finish_ext_x86_14:
	testb $1, %dl
	je poly1305_finish_ext_x86_15
	movzbl (%rcx,%rdi), %eax
	movb %al, (%rcx)
poly1305_finish_ext_x86_15:
	/* append the 1 byte, set the final flag so no 2^128 bit is added, and absorb the block */
	movb $1, (%rsp,%rdx)
	movq %rsp, %rsi
	movl $16, %edx
	movq $1, 64(%rbx)
	movq %rbx, %rdi
	/*
		use the plain local label like every other intra-file call; the exported name goes
		through GLOBAL_HIDDEN_FN, which may decorate the symbol (macro not visible here)
	*/
	call poly1305_blocks_x86_local
poly1305_finish_ext_x86_11:
	/* fully carry h; 17592186044415 = 2^44 - 1, 4398046511103 = 2^42 - 1, %r10 starts at -2^42 */
	movabsq $17592186044415, %rdx
	movq 32(%rbx), %rsi
	movabsq $4398046511103, %rax
	movabsq $-4398046511104, %r10
	movq %rsi, %r9
	shrq $44, %rsi
	addq 40(%rbx), %rsi
	andq %rdx, %r9
	movq %rsi, %r8
	shrq $42, %rsi
	leaq (%rsi,%rsi,4), %rcx
	andq %rax, %r8
	addq 24(%rbx), %rcx
	movq %rcx, %rdi
	shrq $44, %rcx
	addq %r9, %rcx
	andq %rdx, %rdi
	movq %rcx, %rsi
	shrq $44, %rcx
	addq %r8, %rcx
	andq %rdx, %rsi
	andq %rcx, %rax
	shrq $42, %rcx
	/* h0 += 5 * carry (lea adds 4 * carry, the addq below adds the fifth) */
	leaq (%rdi,%rcx,4), %rdi
	addq %rax, %r10
	addq %rcx, %rdi
	movq %rdi, %rcx
	shrq $44, %rdi
	andq %rdx, %rcx
	addq %rsi, %rdi
	/* g = h + 5 - 2^130, limb by limb: %r9 = g0, %rdx = g1, %r10 = g2 */
	leaq 5(%rcx), %r9
	movq %r9, %r11
	andq %rdx, %r9
	shrq $44, %r11
	addq %rdi, %r11
	movq %r11, %rsi
	andq %r11, %rdx
	shrq $44, %rsi
	addq %rsi, %r10
	/* mask %rsi = all ones when g2 did not go negative (h >= p); select g or h without branching */
	movq %r10, %rsi
	shrq $63, %rsi
	subq $1, %rsi
	movq %rsi, %r8
	andq %rsi, %r9
	andq %rsi, %rdx
	notq %r8
	andq %r8, %rcx
	andq %r8, %rdi
	orq %r9, %rcx
	orq %rdx, %rdi
	andq %r8, %rax
	andq %r10, %rsi
	/* repack the three limbs into two 64 bit words and add the pad with carry */
	movq %rdi, %rdx
	shrq $20, %rdi
	orq %rsi, %rax
	salq $44, %rdx
	movq 56(%rbx), %rsi
	salq $24, %rax
	orq %rdx, %rcx
	movq 48(%rbx), %rdx
	orq %rdi, %rax
	addq %rdx, %rcx
	adcq %rsi, %rax
	/* store the tag */
	movq %rcx, 0(%rbp)
	movq %rax, 8(%rbp)
	/* zero out h, r and pad */
	movq $0, 24(%rbx)
	movq $0, 32(%rbx)
	movq $0, 40(%rbx)
	movq $0, (%rbx)
	movq $0, 8(%rbx)
	movq $0, 16(%rbx)
	movq $0, 48(%rbx)
	movq $0, 56(%rbx)
	addq $24, %rsp
	popq %rbx
	popq %rbp
	ret
FN_END poly1305_finish_ext_x86
/*
	poly1305_auth_x86(mac = %rdi, in = %rsi, inlen = %rdx, key = %rcx)

	One shot MAC: builds a temporary state on the stack, runs init, absorbs
	the whole blocks, then lets finish handle the tail and write the tag.
	All three helpers are reached through their _local labels.
*/
GLOBAL_HIDDEN_FN poly1305_auth_x86
poly1305_auth_x86_local:
	pushq %rbp
	movq %rsp, %rbp
	/* callee saved registers go to the 32 bytes below %rbp, reserved by the subq further down */
	movq %rbx, -32(%rbp)
	movq %rdx, %rbx
	movq %r12, -24(%rbp)
	movq %rsi, %r12
	movq %rcx, %rsi
	movq %r13, -16(%rbp)
	movq %rdi, %r13
	movq %r14, -8(%rbp)
	subq $32, %rsp
	movq %rbx, %r14
	/* 64 byte aligned, 128 byte scratch state at %rsp */
	andq $-64, %rsp
	addq $-128, %rsp
	/* init(state, key, inlen): %rsi already holds key and %rdx still holds inlen */
	movq %rsp, %rdi
	call poly1305_init_ext_x86_local
	/* %r14 = inlen rounded down to a multiple of 16; skip blocks when it is zero */
	andq $-16, %r14
	je poly1305_auth_x86_19
	movq %r12, %rsi
	movq %r14, %rdx
	movq %rsp, %rdi
	call poly1305_blocks_x86_local
	/* advance past the absorbed bytes */
	addq %r14, %r12
	subq %r14, %rbx
poly1305_auth_x86_19:
	/* finish(state, in, remaining, mac) */
	movq %r13, %rcx
	movq %rbx, %rdx
	movq %r12, %rsi
	movq %rsp, %rdi
	call poly1305_finish_ext_x86_local
	movq -32(%rbp), %rbx
	movq -24(%rbp), %r12
	movq -16(%rbp), %r13
	movq -8(%rbp), %r14
	leave
	ret
FN_END poly1305_auth_x86
poly1305_auth_x86_local: 292 | pushq %rbp 293 | movq %rsp, %rbp 294 | movq %rbx, -32(%rbp) 295 | movq %rdx, %rbx 296 | movq %r12, -24(%rbp) 297 | movq %rsi, %r12 298 | movq %rcx, %rsi 299 | movq %r13, -16(%rbp) 300 | movq %rdi, %r13 301 | movq %r14, -8(%rbp) 302 | subq $32, %rsp 303 | movq %rbx, %r14 304 | andq $-64, %rsp 305 | addq $-128, %rsp 306 | movq %rsp, %rdi 307 | call poly1305_init_ext_x86_local 308 | andq $-16, %r14 309 | je poly1305_auth_x86_19 310 | movq %r12, %rsi 311 | movq %r14, %rdx 312 | movq %rsp, %rdi 313 | call poly1305_blocks_x86_local 314 | addq %r14, %r12 315 | subq %r14, %rbx 316 | poly1305_auth_x86_19: 317 | movq %r13, %rcx 318 | movq %rbx, %rdx 319 | movq %r12, %rsi 320 | movq %rsp, %rdi 321 | call poly1305_finish_ext_x86_local 322 | movq -32(%rbp), %rbx 323 | movq -24(%rbp), %r12 324 | movq -16(%rbp), %r13 325 | movq -8(%rbp), %r14 326 | leave 327 | ret 328 | FN_END poly1305_auth_x86 329 | 330 | -------------------------------------------------------------------------------- /app/include/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floodyberry/poly1305-opt/700d5cf167441f627d76c845f56b7ea72bdd91e8/app/include/.keep -------------------------------------------------------------------------------- /app/include/poly1305.h: -------------------------------------------------------------------------------- 1 | #ifndef POLY1305_H 2 | #define POLY1305_H 3 | 4 | #include 5 | 6 | #if !defined(LIB_PUBLIC) 7 | #define LIB_PUBLIC 8 | #endif 9 | 10 | #if defined(__cplusplus) 11 | extern "C" { 12 | #endif 13 | 14 | typedef struct poly1305_state { 15 | unsigned char opaque[320]; 16 | } poly1305_state; 17 | 18 | typedef struct poly1305_key { 19 | unsigned char b[32]; 20 | } poly1305_key; 21 | 22 | LIB_PUBLIC void poly1305_init(poly1305_state *S, const poly1305_key *key); 23 | LIB_PUBLIC void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, size_t bytes_hint); 24 | 
LIB_PUBLIC void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen); 25 | LIB_PUBLIC void poly1305_finish(poly1305_state *S, unsigned char *mac); 26 | 27 | LIB_PUBLIC void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, const poly1305_key *key); 28 | 29 | LIB_PUBLIC int poly1305_startup(void); 30 | 31 | #if defined(UTILITIES) 32 | void poly1305_fuzz(void); 33 | void poly1305_bench(void); 34 | #endif 35 | 36 | #if defined(__cplusplus) 37 | } 38 | #endif 39 | 40 | #endif /* POLY1305_H */ 41 | 42 | -------------------------------------------------------------------------------- /app/project.def: -------------------------------------------------------------------------------- 1 | poly1305 2 | -------------------------------------------------------------------------------- /app/project.ver: -------------------------------------------------------------------------------- 1 | 1.0.0 2 | -------------------------------------------------------------------------------- /framework/bench.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cpucycles.h" 4 | #include "cpuid.h" 5 | #include "bench.h" 6 | 7 | /* a 32k, 64 byte aligned buffer to bench with */ 8 | unsigned char * 9 | bench_get_buffer(void) { 10 | static unsigned char buffer[0x8000 + 0x40 + 0x40]; 11 | unsigned char *p = buffer; 12 | p += 0x3f; 13 | p -= (size_t)p & 0x3f; 14 | return p; 15 | } 16 | 17 | static cycles_t smallest_timeslice = ~(cycles_t)0; 18 | static int have_global_stats = 0; 19 | static cycles_t cycles_per_second = 1; 20 | static size_t global_dummy = 0; 21 | 22 | static void 23 | bench_gather_global_stats(void) { 24 | const char *cpu_units = LOCAL_PREFIX(cpucycles_units)(); 25 | size_t delay = 0; 26 | size_t dummy = 55; 27 | clock_t start; 28 | cycles_t delta; 29 | size_t j; 30 | 31 | /* find the smallest one and run with that, this isn't an exact science */ 32 | do { 33 | delta = 
LOCAL_PREFIX(cpucycles)(); 34 | for (j = 0; j < delay; j++) { 35 | dummy ^= (dummy << 1) + j; 36 | dummy += (dummy >> 3); 37 | } 38 | delta = LOCAL_PREFIX(cpucycles)() - delta; 39 | delay++; 40 | } while (!delta); 41 | 42 | /* run until at least one second has passed AND smallest_timeslice has been set, i.e. keep sampling while either condition is still unmet so the minimum is taken over many samples */ 43 | start = clock(); 44 | do { 45 | delta = LOCAL_PREFIX(cpucycles)(); 46 | for (j = 0; j < delay; j++) { 47 | dummy ^= (dummy << 1) + j; 48 | dummy += (dummy >> 3); 49 | } 50 | delta = LOCAL_PREFIX(cpucycles)() - delta; 51 | 52 | /* 2 is as good as 1 cycle_t, and should avoid some burps that gettimeofday has with erroneously reporting 1 cycle_t */ 53 | if ((delta > 1) && (delta < smallest_timeslice)) 54 | smallest_timeslice = delta; 55 | } while (((clock() - start) < CLOCKS_PER_SEC) || (smallest_timeslice == ~(cycles_t)0)); 56 | 57 | /* 1/2 of a second back of the hand calculation for cycles_t per second */ 58 | cycles_per_second = LOCAL_PREFIX(cpucycles)(); 59 | start = clock(); 60 | while ((clock() - start) < (CLOCKS_PER_SEC / 2)) { 61 | dummy ^= (dummy << 1) + 19; 62 | dummy += (dummy >> 3); 63 | } 64 | cycles_per_second = LOCAL_PREFIX(cpucycles)() - cycles_per_second; 65 | cycles_per_second <<= 1; 66 | 67 | 68 | printf("time granularity: %.0f %s, %.0f %s/second\n\n", (double)smallest_timeslice, cpu_units, (double)cycles_per_second, cpu_units); 69 | 70 | global_dummy = dummy & 1; 71 | } 72 | 73 | int 74 | bench(const void *impls, size_t impl_size, impl_test test_fn, impl_bench bench_fn, size_t units_count, const char *units_desc) { 75 | unsigned long cpu_flags = LOCAL_PREFIX(cpuid)(); 76 | const char *cpu_units = LOCAL_PREFIX(cpucycles_units)(); 77 | const unsigned char *p; 78 | int first_item = 1, err = 0; 79 | 80 | if (!have_global_stats) { 81 | bench_gather_global_stats(); 82 | have_global_stats = 1; 83 | } 84 | 85 | /* validate all implementations */ 86 | p = (const unsigned char *)impls; 87 | for (;;) { 88 | const cpu_specific_impl_t *impl = (const
cpu_specific_impl_t *)p; 89 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags)) { 90 | if (test_fn(impl) != 0) { 91 | printf("%s: error in implementation!\n", impl->desc); 92 | err = 1; 93 | } 94 | } 95 | if (impl->cpu_flags == CPUID_GENERIC) 96 | break; 97 | p += impl_size; 98 | } 99 | 100 | if (err) 101 | return 1; 102 | 103 | p = (const unsigned char *)impls; 104 | for (;;) { 105 | const cpu_specific_impl_t *impl = (const cpu_specific_impl_t *)p; 106 | 107 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags)) { 108 | cycles_t tbest = ~(cycles_t)0; 109 | size_t batch_size = 1, trials = 1; 110 | size_t i; 111 | 112 | /* get a rough estimate for batch size and # of trials */ 113 | for (;;) { 114 | cycles_t tbest = ~(cycles_t)0; 115 | size_t i, j; 116 | for (i = 0; i < 100; i++) { 117 | cycles_t t1 = LOCAL_PREFIX(cpucycles)(); 118 | for (j = 0; j < batch_size; j++) 119 | bench_fn(impl); 120 | t1 = LOCAL_PREFIX(cpucycles)() - t1; 121 | if (t1 < tbest) 122 | tbest = t1; 123 | } 124 | if (tbest > smallest_timeslice * 25) { 125 | trials = (cycles_per_second / tbest); 126 | if (trials < 1) 127 | trials = 1; 128 | break; 129 | } 130 | batch_size = (batch_size == 1) ? 2 : (((batch_size * 4) / 3) + 1); 131 | } 132 | 133 | 134 | 135 | /* measure! 
*/ 136 | for (i = 0; i < trials; i++) { 137 | cycles_t t1 = LOCAL_PREFIX(cpucycles)(); 138 | size_t j; 139 | for (j = 0; j < batch_size; j++) 140 | bench_fn(impl); 141 | t1 = LOCAL_PREFIX(cpucycles)() - t1; 142 | if (t1 < tbest) 143 | tbest = t1; 144 | } 145 | 146 | if (first_item) { 147 | printf("%u %s(s):\n", (unsigned int)units_count, units_desc); 148 | first_item = 0; 149 | } 150 | 151 | printf(" %12s, %8.2f %s per call, %8.4f %s/%s\n", 152 | impl->desc, 153 | (double)tbest / batch_size, cpu_units, 154 | ((double)tbest / batch_size) / units_count, cpu_units, units_desc 155 | ); 156 | } 157 | 158 | if (impl->cpu_flags == CPUID_GENERIC) 159 | return 0; 160 | p += impl_size; 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /framework/driver/arm/cpucycles_impl.inc: -------------------------------------------------------------------------------- 1 | #if defined(HAVE_GETTIMEOFDAY) 2 | #include 3 | #endif 4 | 5 | static cycles_t 6 | cpucycles_impl(void) { 7 | #if defined(HAVE_GETTIMEOFDAY) 8 | struct timeval t; 9 | gettimeofday(&t, NULL); 10 | return ((cycles_t)t.tv_sec * 1000000) + (cycles_t)t.tv_usec; 11 | #else 12 | printf("no suitable timing mechanism found\n"); 13 | exit(1); 14 | return 0; 15 | #endif 16 | } 17 | 18 | static const char * 19 | cpucycles_units_impl(void) { 20 | #if defined(HAVE_GETTIMEOFDAY) 21 | return "us"; 22 | #else 23 | return ""; 24 | #endif 25 | } 26 | 27 | -------------------------------------------------------------------------------- /framework/driver/arm/cpuid_flags.inc: -------------------------------------------------------------------------------- 1 | enum cpuid_flags_arm_t { 2 | CPUID_ARM = (1 << 0), 3 | CPUID_ARMv6 = (1 << 1), 4 | CPUID_ARMv7 = (1 << 2), 5 | CPUID_ARMv8 = (1 << 3), 6 | 7 | CPUID_ASIMD = (1 << 18), 8 | CPUID_TLS = (1 << 19), 9 | CPUID_AES = (1 << 20), 10 | CPUID_PMULL = (1 << 21), 11 | CPUID_SHA1 = (1 << 22), 12 | CPUID_SHA2 = (1 << 23), 13 | CPUID_CRC32 = (1 << 
24), 14 | CPUID_IWMMXT = (1 << 25), 15 | CPUID_IDIVT = (1 << 26), 16 | CPUID_IDIVA = (1 << 27), 17 | CPUID_VFP3D16 = (1 << 28), 18 | CPUID_VFP3 = (1 << 29), 19 | CPUID_VFP4 = (1 << 30), 20 | CPUID_NEON = (1 << 31) 21 | }; 22 | -------------------------------------------------------------------------------- /framework/driver/arm/cpuid_impl.inc: -------------------------------------------------------------------------------- 1 | #include "cpuid_impl_linux.inc" 2 | #include "cpuid_impl_msvc.inc" 3 | #include "cpuid_impl_netbsd.inc" 4 | 5 | 6 | static unsigned long 7 | cpuid_impl(void) { 8 | unsigned long flags = cpuid_specific_impl(); 9 | if (flags & CPUID_ARMv8) 10 | flags |= (CPUID_ARMv7 | CPUID_ARMv6); 11 | if (flags & CPUID_ARMv7) 12 | flags |= (CPUID_ARMv6); 13 | if (flags & CPUID_NEON) 14 | flags |= (CPUID_ARMv7 | CPUID_ARMv6 | CPUID_VFP3); 15 | /* vfp3d16 is used for both vfp3 & vfp4 */ 16 | if (flags & CPUID_VFP3D16) 17 | flags &= ~(CPUID_VFP3 | CPUID_VFP4); 18 | return flags; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /framework/driver/arm/cpuid_impl_linux.inc: -------------------------------------------------------------------------------- 1 | #if defined(__linux__) 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define CPUINFO_LINE_LENGTH 128 10 | 11 | typedef struct cpuid_flags_t { 12 | unsigned long processor; 13 | unsigned long features; 14 | unsigned long implementer; 15 | unsigned long arch; 16 | unsigned long variant; 17 | unsigned long part; 18 | unsigned long revision; 19 | } cpuid_flags_t; 20 | 21 | typedef struct cpuid_flag_table_t { 22 | const char *str; 23 | unsigned long flag; 24 | } cpuid_flag_table_t; 25 | 26 | static const cpuid_flag_table_t features[] = { 27 | {"tls ", CPUID_TLS}, 28 | {"aes ", CPUID_AES}, 29 | {"pmull ", CPUID_PMULL}, 30 | {"sha1 ", CPUID_SHA1}, 31 | {"sha2 ", CPUID_SHA2}, 32 | {"crc32 ", CPUID_CRC32}, 33 | {"iwmmxt ", CPUID_IWMMXT}, 34 | 
{"idivt ", CPUID_IDIVT}, 35 | {"idiva ", CPUID_IDIVA}, 36 | {"vfpv3d16 ", CPUID_VFP3D16}, 37 | {"vfpv3 ", CPUID_VFP3}, 38 | {"vfpv4 ", CPUID_VFP4}, 39 | {"neon ", CPUID_NEON}, 40 | {"asimd ", CPUID_ASIMD}, 41 | {NULL, 0} 42 | }; 43 | 44 | /* that's an L, not a 1?? */ 45 | static const cpuid_flag_table_t processors[] = { 46 | {"(v6l)", CPUID_ARMv6}, 47 | {"(v7l)", CPUID_ARMv7}, 48 | {"(aarch64)", CPUID_ARMv8}, 49 | {NULL, 0} 50 | }; 51 | 52 | static const cpuid_flag_table_t archs[] = { 53 | {"6TEJ", CPUID_ARMv6}, 54 | {"7", CPUID_ARMv7}, 55 | {"7M", CPUID_ARMv7}, 56 | {"AArch64", CPUID_ARMv8}, 57 | {NULL, 0} 58 | }; 59 | 60 | static const char * 61 | cpuid_ltrim(const char *line) { 62 | /* advance to the ':' */ 63 | while (*line && (*line != ':')) 64 | line++; 65 | 66 | if (*line == ':') 67 | line++; 68 | 69 | /* skip whitespace */ 70 | while (*line && ((*line == ' ') || (*line == '\t'))) 71 | line++; 72 | 73 | return line; 74 | } 75 | /* parse a NUL-terminated decimal or 0x-prefixed hex number; returns 0 if any character is not a valid digit */ 76 | static unsigned long 77 | cpuid_parse_unsigned(const char *line) { 78 | unsigned long value = 0; 79 | 80 | if ((line[0] == '0') && (line[1] == 'x')) { 81 | for (line += 2; *line; line++) { 82 | unsigned long digit = *line; 83 | if ((digit - '0') < 10) 84 | digit -= '0'; 85 | else if ((digit - 'A') < 6) /* 'A'..'F' only; unsigned wrap rejects anything below 'A' */ 86 | digit -= ('A' - 10); 87 | else if ((digit - 'a') < 6) /* 'a'..'f' only */ 88 | digit -= ('a' - 10); 89 | else 90 | return 0; 91 | value = (value * 16) + digit; 92 | } 93 | } else { 94 | for (; *line; line++) { 95 | unsigned long digit = *line; 96 | if ((digit - '0') < 10) 97 | digit -= '0'; 98 | else 99 | return 0; 100 | value = (value * 10) + digit; 101 | } 102 | } 103 | 104 | return value; 105 | } 106 | 107 | static unsigned long 108 | cpuid_scan(const char *line, const cpuid_flag_table_t *table) { 109 | unsigned long flags = 0; 110 | 111 | for (; table->str; table++) { 112 | if (strstr(line, table->str) != NULL) 113 | flags |= table->flag; 114 | } 115 | 116 | return flags; 117 | } 118 | 119 | /* flags: [processor, feature, arch, part] */ 120
| static void 121 | cpuid_line_parse(const char *line, cpuid_flags_t *flags) { 122 | const char *trimmed = cpuid_ltrim(line); 123 | if (strncmp(line, "Processor", 9) == 0) 124 | flags->processor = cpuid_scan(trimmed, processors); 125 | else if (strncmp(line, "Features", 8) == 0) 126 | flags->features |= cpuid_scan(trimmed, features); 127 | else if (strncmp(line, "CPU implementer", 15) == 0) 128 | flags->implementer = cpuid_parse_unsigned(trimmed); 129 | else if (strncmp(line, "CPU architecture", 16) == 0) 130 | flags->arch = cpuid_scan(line + 16, archs); 131 | else if (strncmp(line, "CPU variant", 11) == 0) 132 | flags->variant = cpuid_parse_unsigned(trimmed); 133 | else if (strncmp(line, "CPU part", 8) == 0) 134 | flags->part = cpuid_parse_unsigned(trimmed); 135 | else if (strncmp(line, "CPU revision", 12) == 0) 136 | flags->revision = cpuid_parse_unsigned(trimmed); 137 | } 138 | 139 | static int 140 | cpuid_line_length(char *line, int start, int end) { 141 | int i; 142 | for (i = start; i < end; i++) { 143 | if (line[i] == '\n') { 144 | line[i] = 0; 145 | return i - start; 146 | } 147 | } 148 | return -1; 149 | } 150 | 151 | /* parse /proc/cpuinfo in-place with no allocations */ 152 | static unsigned long 153 | cpuid_specific_impl(void) { 154 | cpuid_flags_t flags = {0, 0, 0, 0, 0, 0, 0}; 155 | int skip_to_next_line = 0; 156 | int incomplete_bytes = 0; 157 | 158 | char line[CPUINFO_LINE_LENGTH]; 159 | int fd; 160 | 161 | fd = open("/proc/cpuinfo", O_RDONLY); 162 | if (fd < 0) 163 | goto cpuid_specific_impl_done; 164 | 165 | for (;;) { 166 | int cur_line_pos = 0; 167 | int bytes_read = read(fd, line + incomplete_bytes, CPUINFO_LINE_LENGTH - incomplete_bytes); 168 | int bytes_left; 169 | int cur_line_end; 170 | 171 | if (bytes_read <= 0) { 172 | if ((bytes_read < 0) && (errno == EINTR)) 173 | continue; 174 | goto cpuid_specific_impl_done; 175 | } 176 | 177 | bytes_left = bytes_read + incomplete_bytes; 178 | cur_line_end = bytes_left; 179 | incomplete_bytes = 0; 180 
| while (bytes_left) { 181 | int line_length = cpuid_line_length(line, cur_line_pos, cur_line_end); 182 | 183 | /* if the line extends past the buffer.. */ 184 | if (line_length < 0) { 185 | if (cur_line_pos == 0) { 186 | /* and it's larger than our buffer, skip it */ 187 | skip_to_next_line = 1; 188 | } else { 189 | /* otherwise copy it to the front */ 190 | memmove(line, line + cur_line_pos, CPUINFO_LINE_LENGTH - cur_line_pos); 191 | incomplete_bytes = bytes_left; 192 | line[incomplete_bytes] = 0; 193 | cur_line_pos = 0; 194 | } 195 | 196 | /* break out and read more */ 197 | break; 198 | } 199 | 200 | /* found the end of a line, are we skipping until a new line? */ 201 | if (!skip_to_next_line) 202 | cpuid_line_parse(line + cur_line_pos, &flags); 203 | else 204 | skip_to_next_line = 0; 205 | 206 | cur_line_pos += line_length + 1; 207 | bytes_left -= line_length + 1; 208 | } 209 | } 210 | 211 | cpuid_specific_impl_done: 212 | if (fd != -1) 213 | close(fd); 214 | 215 | /* trust processor over arch, see https://code.google.com/p/android/issues/detail?id=10812 */ 216 | if (!flags.processor) 217 | flags.processor = flags.arch; 218 | 219 | switch (flags.implementer) { 220 | case 0x41: /* ARM */ 221 | /* 0xb02: armv6k - mpcore */ 222 | /* 0xb36: armv6j - arm1136j-s */ 223 | /* 0xb56: armv6t2 - arm1156t2-s */ 224 | /* 0xb76: armv6zk - arm1176jz-s */ 225 | /* 0xc05: armv7-a - cortex-a5 */ 226 | /* 0xc07: armv7ve - cortex-a7 */ 227 | /* 0xc08: armv7-a - cortex-a8 */ 228 | /* 0xc09: armv7-a - cortex-a9 */ 229 | /* 0xc0d: armv7ve - cortex-a12 */ 230 | /* 0xc0f: armv7ve - cortex-a15 */ 231 | /* 0xc14: armv7-r - cortex-r4 */ 232 | /* 0xc15: armv7-r - cortex-r5 */ 233 | /* 0xc20: armv6-m - cortex-m0 */ 234 | /* 0xc21: armv6-m - cortex-m1 */ 235 | /* 0xc23: armv7-m - cortex-m3 */ 236 | /* 0xc24: armv7e-m - cortex-m4 */ 237 | /* 0xc60: armv6-m - cortex-m0+ */ 238 | /* 0xd03: armv8-a - cortex-a53 */ 239 | /* 0xd07: armv8-a - cortex-a57 */ 240 | break; 241 | 242 | case 0x51: /* 
Qualcomm */ 243 | /* 0x0d4: armv7-a - MSM8960 */ 244 | /* 0x06f: armv7-a - APQ8064 */ 245 | 246 | /* work around faulty neon implementation https://code.google.com/p/chromium/issues/detail?id=341598 */ 247 | if ((flags.arch == CPUID_ARMv7) && (flags.variant == 1) && (flags.part == 0x4d) && (flags.revision == 0)) 248 | flags.features &= ~CPUID_NEON; 249 | break; 250 | 251 | case 0x69: /* Intel */ 252 | break; 253 | 254 | default: 255 | break; 256 | } 257 | 258 | return CPUID_ARM | flags.processor | flags.features; 259 | } 260 | 261 | #endif 262 | -------------------------------------------------------------------------------- /framework/driver/arm/cpuid_impl_msvc.inc: -------------------------------------------------------------------------------- 1 | #if defined(_MSC_VER) 2 | 3 | static unsigned long 4 | cpuid_specific_impl(void) { 5 | unsigned long flags = CPUID_ARM; 6 | 7 | #define CPUID_TEST_FEATURE(feature, value) \ 8 | if (IsProcessorFeaturePresent(feature)) \ 9 | flags |= (value); 10 | 11 | #if defined(PF_NX_ENABLED) 12 | CPUID_TEST_FEATURE(PF_NX_ENABLED, CPUID_ARMv6) 13 | #endif 14 | 15 | #if defined(PF_ARM_V6) 16 | CPUID_TEST_FEATURE(PF_ARM_V6, CPUID_ARMv6) 17 | #endif 18 | 19 | #if defined(PF_ARM_V7) 20 | CPUID_TEST_FEATURE(PF_ARM_V7, CPUID_ARMv7) 21 | #endif 22 | 23 | #if defined(PF_ARM_DIVIDE_INSTRUCTION_AVAILABLE) 24 | CPUID_TEST_FEATURE(PF_ARM_DIVIDE_INSTRUCTION_AVAILABLE, CPUID_IDIVT | CPUID_IDIVA) 25 | #endif 26 | 27 | #if defined(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) 28 | CPUID_TEST_FEATURE(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, CPUID_NEON) 29 | #endif 30 | 31 | #if defined(PF_ARM_NEON) 32 | CPUID_TEST_FEATURE(PF_ARM_NEON, CPUID_NEON) 33 | #endif 34 | 35 | #if defined(PF_ARM_VFP_32_REGISTERS_AVAILABLE) 36 | CPUID_TEST_FEATURE(PF_ARM_VFP_32_REGISTERS_AVAILABLE, CPUID_VFP3 | CPUID_NEON) 37 | #endif 38 | 39 | return flags; 40 | } 41 | 42 | #endif 43 | 44 | -------------------------------------------------------------------------------- 
/framework/driver/arm/cpuid_impl_netbsd.inc: -------------------------------------------------------------------------------- 1 | #if defined(__NetBSD__) 2 | 3 | #include 4 | 5 | static unsigned long 6 | cpuid_specific_impl(void) { 7 | unsigned long flags = CPUID_ARM; 8 | size_t len; 9 | int flag; 10 | 11 | len = sizeof(flag); 12 | if (!sysctlbyname("machdep.simdex_present", &flag, &len, NULL, 0) && flag) 13 | flags |= CPUID_ARMv6; 14 | 15 | len = sizeof(flag); 16 | if (!sysctlbyname("machdep.neon_present", &flag, &len, NULL, 0) && flag) 17 | flags |= CPUID_NEON; 18 | 19 | /* should use machdep.cpu_id as well.. */ 20 | 21 | return flags; 22 | } 23 | 24 | #endif -------------------------------------------------------------------------------- /framework/driver/arm/gcc.inc: -------------------------------------------------------------------------------- 1 | #ifndef BASE_GCC_ARM_S 2 | #define BASE_GCC_ARM_S 3 | 4 | #include "asmopt_internal.h" 5 | 6 | #if !defined(HAVE_SLASHMACRO) && !defined(HAVE_DOLLARMACRO) 7 | #error Unknown gnu as macro parameter convention! 
Run ./configure 8 | #endif 9 | 10 | .syntax unified 11 | .arm 12 | 13 | #define IS_ARM32 (defined(__arm__)) 14 | #define IS_ARM64 (defined(__aarch64__)) 15 | #define IS_ELF (defined(__ELF__)) 16 | #define IS_MACH (defined(__MACH__)) 17 | 18 | #if (IS_ELF) 19 | .macro FN name 20 | .align 2 21 | \name: 22 | _\name: 23 | .endm 24 | 25 | .macro FN_END name 26 | .size \name, .-\name 27 | .type \name STT_FUNC 28 | .size _\name, .-_\name 29 | .type _\name STT_FUNC 30 | .endm 31 | 32 | .macro HIDDEN name 33 | #if defined(HAVE_AS_HIDDEN) 34 | .hidden \name 35 | .hidden _\name 36 | #endif 37 | .endm 38 | 39 | /* set NX for stack */ 40 | .section .note.GNU-stack,"",%progbits 41 | #elif (IS_MACH) 42 | .macro FN name 43 | .align 2 44 | #if defined(HAVE_SLASHMACRO) 45 | \name: 46 | _\name: 47 | #elif defined(HAVE_DOLLARMACRO) 48 | $0: 49 | _$0: 50 | #endif 51 | .endm 52 | 53 | .macro FN_END name 54 | .endm 55 | 56 | .macro HIDDEN name 57 | #if defined(HAVE_AS_PRIVATE_EXTERN) 58 | #if defined(HAVE_SLASHMACRO) 59 | .private_extern \name 60 | .private_extern _\name 61 | #elif defined(HAVE_DOLLARMACRO) 62 | .private_extern $0 63 | .private_extern _$0 64 | #endif 65 | #endif 66 | .endm 67 | #endif 68 | 69 | /* put everything in the code segment to simplify things */ 70 | #if (IS_MACH) 71 | .macro SECTION_TEXT 72 | .section __TEXT,__text,regular 73 | .endm 74 | 75 | .macro SECTION_RODATA 76 | .section __TEXT,__text,regular 77 | .endm 78 | #else 79 | /* put everything in the code segment to simplify things */ 80 | .macro SECTION_TEXT 81 | .text 82 | .endm 83 | 84 | .macro SECTION_RODATA 85 | .text 86 | .endm 87 | #endif 88 | 89 | /* declare a global function */ 90 | .macro GLOBAL name 91 | #if defined(HAVE_SLASHMACRO) 92 | .globl \name 93 | .globl _\name 94 | #elif defined(HAVE_DOLLARMACRO) 95 | .globl $0 96 | .globl _$0 97 | #endif 98 | .endm 99 | 100 | .macro FN_LOCAL_PREFIX name 101 | #if defined(HAVE_SLASHMACRO) 102 | FN LOCAL_PREFIX(\name) 103 | #elif defined(HAVE_DOLLARMACRO) 104 
| FN LOCAL_PREFIX($0) 105 | #endif 106 | .endm 107 | 108 | .macro FN_END_LOCAL_PREFIX name 109 | #if defined(HAVE_SLASHMACRO) 110 | FN_END LOCAL_PREFIX(\name) 111 | #elif defined(HAVE_DOLLARMACRO) 112 | FN_END LOCAL_PREFIX($0) 113 | #endif 114 | .endm 115 | 116 | .macro GLOBAL_LOCAL_PREFIX name 117 | #if defined(HAVE_SLASHMACRO) 118 | GLOBAL LOCAL_PREFIX(\name) 119 | HIDDEN LOCAL_PREFIX(\name) 120 | #elif defined(HAVE_DOLLARMACRO) 121 | GLOBAL LOCAL_PREFIX($0) 122 | HIDDEN LOCAL_PREFIX($0) 123 | #endif 124 | .endm 125 | 126 | .macro GLOBAL_HIDDEN_FN name 127 | #if defined(HAVE_SLASHMACRO) 128 | GLOBAL \name 129 | HIDDEN \name 130 | FN \name 131 | #elif defined(HAVE_DOLLARMACRO) 132 | GLOBAL $0 133 | HIDDEN $0 134 | FN $0 135 | #endif 136 | .endm 137 | 138 | /* pic support */ 139 | .macro LOAD_VAR_PIC var, reg 140 | #if (IS_ARM32) 141 | #if defined(HAVE_SLASHMACRO) 142 | adrl \reg, \var 143 | #elif defined(HAVE_DOLLARMACRO) 144 | adrl $1, $0 145 | #endif 146 | #elif (IS_ARM64) 147 | #if defined(HAVE_SLASHMACRO) 148 | adr \reg, \var 149 | #elif defined(HAVE_DOLLARMACRO) 150 | adr $1, $0 151 | #endif 152 | #endif 153 | .endm 154 | 155 | #if defined(HAVE_SLASHMACRO) 156 | #define INCLUDE_FILE_PARM "\file" 157 | #elif defined(HAVE_DOLLARMACRO) 158 | #define INCLUDE_FILE_PARM $0 159 | #endif 160 | 161 | .macro INCLUDE file 162 | .include INCLUDE_FILE_PARM 163 | .endm 164 | 165 | /* include the file with the variable(s) if variable 'name' is not already included */ 166 | .macro INCLUDE_VAR_FILE file, name 167 | #if defined(HAVE_SLASHMACRO) 168 | .ifndef \name 169 | .include INCLUDE_FILE_PARM 170 | .endif 171 | #elif defined(HAVE_DOLLARMACRO) 172 | .ifndef $1 173 | .include INCLUDE_FILE_PARM 174 | .endif 175 | #endif 176 | .endm 177 | 178 | #endif /* BASE_GCC_ARM_S */ 179 | -------------------------------------------------------------------------------- /framework/driver/cpucycles.c: -------------------------------------------------------------------------------- 1 | 
#include 2 | #include 3 | #include "cpuid.h" 4 | #include "cpucycles.h" 5 | 6 | 7 | #include "cpucycles_impl.inc" 8 | 9 | cycles_t 10 | LOCAL_PREFIX(cpucycles)(void) { 11 | return cpucycles_impl(); 12 | } 13 | 14 | const char *LOCAL_PREFIX(cpucycles_units)(void) { 15 | return cpucycles_units_impl(); 16 | } 17 | 18 | -------------------------------------------------------------------------------- /framework/driver/cpuid.c: -------------------------------------------------------------------------------- 1 | #include "cpuid.h" 2 | 3 | #include "cpuid_impl.inc" 4 | 5 | static unsigned long cpuid_flags = CPUID_GENERIC; 6 | static unsigned long cpuid_mask = ~(unsigned long)0; 7 | 8 | unsigned long 9 | LOCAL_PREFIX(cpuid)(void) { 10 | if (cpuid_flags == CPUID_GENERIC) 11 | cpuid_flags = cpuid_impl(); 12 | return cpuid_flags & cpuid_mask; 13 | } 14 | 15 | const void * 16 | LOCAL_PREFIX(cpu_select)(const void *impls, size_t impl_size, impl_test test_fn) { 17 | unsigned long cpu_flags = LOCAL_PREFIX(cpuid)(); 18 | const unsigned char *p = (const unsigned char *)impls; 19 | for (;;) { 20 | const cpu_specific_impl_t *impl = (const cpu_specific_impl_t *)p; 21 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags)) { 22 | if (test_fn(impl) == 0) 23 | return impl; 24 | } 25 | if (impl->cpu_flags == CPUID_GENERIC) 26 | return NULL; 27 | p += impl_size; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /framework/driver/gcc_driver.inc: -------------------------------------------------------------------------------- 1 | #ifndef GCC_DRIVER_INC 2 | #define GCC_DRIVER_INC 3 | 4 | #include "asmopt_internal.h" 5 | 6 | #if (defined(__i386__ ) || defined(__x86_64__)) 7 | #include "x86/gcc.inc" 8 | #endif 9 | 10 | #if (defined(__arm__ ) || defined(__aarch64__)) 11 | #include "arm/gcc.inc" 12 | #endif 13 | 14 | .macro INCLUDE_IF_X86_32BIT file 15 | #if (IS_X86_32) 16 | .include INCLUDE_FILE_PARM 17 | #endif 18 | .endm 19 | 20 | .macro 
INCLUDE_IF_X86_64BIT file 21 | #if (IS_X86_64) 22 | .include INCLUDE_FILE_PARM 23 | #endif 24 | .endm 25 | 26 | 27 | .macro INCLUDE_IF_MMX_32BIT file 28 | #if defined(HAVE_MMX) 29 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 30 | #endif 31 | .endm 32 | 33 | .macro INCLUDE_IF_MMX_64BIT file 34 | #if defined(HAVE_MMX) 35 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 36 | #endif 37 | .endm 38 | 39 | 40 | .macro INCLUDE_IF_SSE_32BIT file 41 | #if defined(HAVE_SSE) 42 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 43 | #endif 44 | .endm 45 | 46 | .macro INCLUDE_IF_SSE_64BIT file 47 | #if defined(HAVE_SSE) 48 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 49 | #endif 50 | .endm 51 | 52 | 53 | .macro INCLUDE_IF_SSE2_32BIT file 54 | #if defined(HAVE_SSE2) 55 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 56 | #endif 57 | .endm 58 | 59 | .macro INCLUDE_IF_SSE2_64BIT file 60 | #if defined(HAVE_SSE2) 61 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 62 | #endif 63 | .endm 64 | 65 | 66 | .macro INCLUDE_IF_SSE3_32BIT file 67 | #if defined(HAVE_SSE3) 68 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 69 | #endif 70 | .endm 71 | 72 | .macro INCLUDE_IF_SSE3_64BIT file 73 | #if defined(HAVE_SSE3) 74 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 75 | #endif 76 | .endm 77 | 78 | 79 | .macro INCLUDE_IF_SSSE3_32BIT file 80 | #if defined(HAVE_SSSE3) 81 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 82 | #endif 83 | .endm 84 | 85 | .macro INCLUDE_IF_SSSE3_64BIT file 86 | #if defined(HAVE_SSSE3) 87 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 88 | #endif 89 | .endm 90 | 91 | 92 | .macro INCLUDE_IF_SSE4_1_32BIT file 93 | #if defined(HAVE_SSE4_1) 94 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 95 | #endif 96 | .endm 97 | 98 | .macro INCLUDE_IF_SSE4_1_64BIT file 99 | #if defined(HAVE_SSE4_1) 100 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 101 | #endif 102 | .endm 103 | 104 | 105 | .macro INCLUDE_IF_SSE4_2_32BIT file 106 | #if defined(HAVE_SSE4_2) 107 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 108 | #endif 109 | .endm 110 | 111 | .macro 
INCLUDE_IF_SSE4_2_64BIT file 112 | #if defined(HAVE_SSE4_2) 113 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 114 | #endif 115 | .endm 116 | 117 | 118 | .macro INCLUDE_IF_AVX_32BIT file 119 | #if defined(HAVE_AVX) 120 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 121 | #endif 122 | .endm 123 | 124 | .macro INCLUDE_IF_AVX_64BIT file 125 | #if defined(HAVE_AVX) 126 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 127 | #endif 128 | .endm 129 | 130 | 131 | .macro INCLUDE_IF_XOP_32BIT file 132 | #if defined(HAVE_XOP) 133 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 134 | #endif 135 | .endm 136 | 137 | .macro INCLUDE_IF_XOP_64BIT file 138 | #if defined(HAVE_XOP) 139 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 140 | #endif 141 | .endm 142 | 143 | 144 | .macro INCLUDE_IF_AVX2_32BIT file 145 | #if defined(HAVE_AVX2) 146 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 147 | #endif 148 | .endm 149 | 150 | .macro INCLUDE_IF_AVX2_64BIT file 151 | #if defined(HAVE_AVX2) 152 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 153 | #endif 154 | .endm 155 | 156 | 157 | .macro INCLUDE_IF_AVX512_32BIT file 158 | #if defined(HAVE_AVX512) 159 | INCLUDE_IF_X86_32BIT INCLUDE_FILE_PARM 160 | #endif 161 | .endm 162 | 163 | .macro INCLUDE_IF_AVX512_64BIT file 164 | #if defined(HAVE_AVX512) 165 | INCLUDE_IF_X86_64BIT INCLUDE_FILE_PARM 166 | #endif 167 | .endm 168 | 169 | #endif /* GCC_DRIVER_INC */ 170 | 171 | -------------------------------------------------------------------------------- /framework/driver/generic/cpucycles_impl.inc: -------------------------------------------------------------------------------- 1 | #if defined(HAVE_GETTIMEOFDAY) 2 | #include 3 | #endif 4 | 5 | static cycles_t 6 | cpucycles_impl(void) { 7 | #if defined(HAVE_GETTIMEOFDAY) 8 | struct timeval t; 9 | gettimeofday(&t, NULL); 10 | return ((cycles_t)t.tv_sec * 1000000) + (cycles_t)t.tv_usec; 11 | #else 12 | printf("no suitable timing mechanism found\n"); 13 | exit(1); 14 | return 0; 15 | #endif 16 | } 17 | 18 | static const char * 19 | 
cpucycles_units_impl(void) { 20 | #if defined(HAVE_GETTIMEOFDAY) 21 | return "us"; 22 | #else 23 | return ""; 24 | #endif 25 | } 26 | 27 | -------------------------------------------------------------------------------- /framework/driver/generic/cpuid_flags.inc: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /framework/driver/generic/cpuid_impl.inc: -------------------------------------------------------------------------------- 1 | static unsigned long long 2 | cpuid_impl(void) { 3 | return CPUID_GENERIC; 4 | } 5 | -------------------------------------------------------------------------------- /framework/driver/x86/cpucycles_impl.inc: -------------------------------------------------------------------------------- 1 | typedef cycles_t (*cpucycles_x86_fn)(void); 2 | 3 | extern cycles_t LOCAL_PREFIX(cpucycles_x86)(void); 4 | static cycles_t cpucycles_select(void); 5 | 6 | static cpucycles_x86_fn cpucycles_impl = cpucycles_select; 7 | 8 | #if defined(HAVE_GETTIMEOFDAY) 9 | #include 10 | 11 | static cycles_t 12 | cpucycles_x86_fallback(void) { 13 | struct timeval t; 14 | gettimeofday(&t, NULL); 15 | return ((cycles_t)t.tv_sec * 1000000) + (cycles_t)t.tv_usec; 16 | } 17 | #else 18 | /* what can a 386/486 use for this otherwise? */ 19 | static cycles_t 20 | cpucycles_x86_fallback(void) { 21 | printf("no suitable timing mechanism found\n"); 22 | exit(1); 23 | } 24 | #endif 25 | 26 | static cycles_t 27 | cpucycles_select(void) { 28 | cpucycles_impl = (LOCAL_PREFIX(cpuid)() & CPUID_RDTSC) ? 
LOCAL_PREFIX(cpucycles_x86) : cpucycles_x86_fallback;
	return cpucycles_impl();
}

/*
	Name of the unit the cycle counter reports in: "cycles" when the cpuid
	flags include CPUID_RDTSC, otherwise "us" if gettimeofday was detected
	at configure time, otherwise an empty string.
*/
static const char *
cpucycles_units_impl(void) {
	if ((LOCAL_PREFIX(cpuid)() & CPUID_RDTSC))
		return "cycles";
#if defined(HAVE_GETTIMEOFDAY)
	return "us";
#else
	return "";
#endif
}

--------------------------------------------------------------------------------
/framework/driver/x86/cpuid_flags.inc:
--------------------------------------------------------------------------------
/*
	Feature bits returned by cpuid_x86. The values mirror the CPUID_* defines
	in x86/gcc.inc and x86/yasm.inc, which the assembly side uses.
*/
enum cpuid_flags_x86_t {
	/* instruction set tiers */
	CPUID_X86       = (1 <<  0),
	CPUID_MMX       = (1 <<  1),
	CPUID_SSE       = (1 <<  2),
	CPUID_SSE2      = (1 <<  3),
	CPUID_SSE3      = (1 <<  4),
	CPUID_SSSE3     = (1 <<  5),
	CPUID_SSE4_1    = (1 <<  6),
	CPUID_SSE4_2    = (1 <<  7),
	CPUID_AVX       = (1 <<  8),
	CPUID_XOP       = (1 <<  9),
	CPUID_AVX2      = (1 << 10),
	CPUID_AVX512    = (1 << 11),

	/* individual features */
	CPUID_RDTSC     = (1 << 25),
	CPUID_RDRAND    = (1 << 26),
	CPUID_POPCNT    = (1 << 27),
	CPUID_FMA4      = (1 << 28),
	CPUID_FMA3      = (1 << 29),
	CPUID_PCLMULQDQ = (1 << 30),
	CPUID_AES       = (1 << 31)
};

--------------------------------------------------------------------------------
/framework/driver/x86/cpuid_impl.inc:
--------------------------------------------------------------------------------
/* assembly routine defined in x86/driver.S */
extern uint32_t LOCAL_PREFIX(cpuid_x86)(void);

/* x86 cpuid backend: forwards to the assembly detection routine */
static uint32_t
cpuid_impl(void) {
	return LOCAL_PREFIX(cpuid_x86)();
}

--------------------------------------------------------------------------------
/framework/driver/x86/driver.S:
--------------------------------------------------------------------------------
#if defined(__GNUC__)
#include "gcc_driver.inc"
#else
;.if 0
%include "yasm_driver.inc"
;.endif
#endif

SECTION_TEXT

GLOBAL_LOCAL_PREFIX cpuid_x86
FN_LOCAL_PREFIX cpuid_x86
	/*
		Returns the CPUID_* feature mask in eax.
		Register use inside the routine:
			esi = accumulated flags
			edi = max basic cpuid level
			ebp = low 32 bits of XCR0
		NOTE(review): on 32-bit builds CPUID_PROLOGUE branches straight to
		Lcpuid_x86_done when the cpuid instruction is unavailable, which is
		before esi is initialised below - confirm the intended return value.
	*/
	CPUID_PROLOGUE

	/* use esi for flags */
	movl $(CPUID_X86), %esi

	/* cpuid 0 */
	movl $0, %eax
	xorl %ecx, %ecx
	cpuid

	/* eax = max level, store in edi */
	movl %eax, %edi

	/* cpus with >=2 cpuid levels support rdtsc */
	cmpl $2, %edi
	jb 1f
	orl $(CPUID_RDTSC), %esi
1:

	testl $0x00000500, %edi
	jz Lcpuid_x86_notp5

	/* Intel P5 pre-B0, only MMX */
	orl $(CPUID_MMX), %esi
	jmp Lcpuid_x86_done

Lcpuid_x86_notp5:

	/* cpuid 1: feature bits arrive in ecx and edx */
	movl $1, %eax
	xorl %ecx, %ecx
	cpuid

	/* rdrand */
	testl $(1 << 30), %ecx
	jz 1f
	orl $(CPUID_RDRAND), %esi
1:

	/* aes */
	testl $(1 << 25), %ecx
	jz 1f
	orl $(CPUID_AES), %esi
1:

	/* popcnt */
	testl $(1 << 23), %ecx
	jz 1f
	orl $(CPUID_POPCNT), %esi
1:

	/*
		fma3
		NOTE(review): set from the cpuid bit alone, before the XCR0 check
		further down - confirm users of CPUID_FMA3 also require CPUID_AVX.
	*/
	testl $(1 << 12), %ecx
	jz 1f
	orl $(CPUID_FMA3), %esi
1:

	/* pclmulqdq */
	testl $(1 << 1), %ecx
	jz 1f
	orl $(CPUID_PCLMULQDQ), %esi
1:

	/* SSE4.2 */
	testl $(1 << 20), %ecx
	jz 1f
	orl $(CPUID_SSE4_2), %esi
1:

	/* SSE4.1 */
	testl $(1 << 19), %ecx
	jz 1f
	orl $(CPUID_SSE4_1), %esi
1:

	/* SSSE3 */
	testl $(1 << 9), %ecx
	jz 1f
	orl $(CPUID_SSSE3), %esi
1:

	/* SSE3 */
	testl $(1 ), %ecx
	jz 1f
	orl $(CPUID_SSE3), %esi
1:

	/* SSE2 */
	testl $(1 << 26), %edx
	jz 1f
	orl $(CPUID_SSE2), %esi
1:

	/* SSE */
	testl $(1 << 25), %edx
	jz 1f
	orl $(CPUID_SSE), %esi
1:

	/* MMX */
	testl $(1 << 23), %edx
	jz 1f
	orl $(CPUID_MMX), %esi
1:

	/* test for xsave enabled by os */
	testl $(1 << 27), %ecx
	jz Lcpuid_x86_skipavxplus

	/* test for avx supported by cpu */
	testl $(1 << 28), %ecx
	jz Lcpuid_x86_skipavxplus

	/* xgetbv(0), emitted as raw bytes */
	xorl %ecx, %ecx
	.byte 0x0f, 0x01, 0xd0

	/* save XCR0 in scratch(ebp) */
	movl %eax, %ebp

	/* XCR0 & (XMM | YMM) */
	andl $((1 << 2) | (1 << 1)), %eax
	cmpl $((1 << 2) | (1 << 1)), %eax
	jne Lcpuid_x86_skipavxplus

	/* AVX is ok to use */
	orl $(CPUID_AVX), %esi

	/* check for max level >= 7 */
	cmpl $7, %edi
	jb Lcpuid_x86_cpuid_below_7

	/* cpuid 7 */
	movl $7, %eax
	xorl %ecx, %ecx
	cpuid

	/* AVX2 */
	testl $(1 << 5), %ebx
	jz 1f
	orl $(CPUID_AVX2), %esi
1:

	/* XCR0 & (OPMASK | ZMMUPPER | ZMMEXTENDED) */
	andl $((1 << 5) | (1 << 6) | (1 << 7)), %ebp
	cmpl $((1 << 5) | (1 << 6) | (1 << 7)), %ebp
	jne Lcpuid_x86_skipavx512

	/* AVX-512 */
	testl $(1 << 16), %ebx
	jz 1f
	orl $(CPUID_AVX512), %esi
1:

Lcpuid_x86_skipavx512:

Lcpuid_x86_cpuid_below_7:

	/* cpuid 0x80000000 */
	movl $0x80000000, %eax
	xorl %ecx, %ecx
	cpuid

	/* eax = max extended level */
	cmpl $0x80000001, %eax
	jb Lcpuid_x86_skipxopplus

	/* cpuid $0x80000001 */
	movl $0x80000001, %eax
	xorl %ecx, %ecx
	cpuid

	/* fma4 */
	testl $(1 << 16), %ecx
	jz 1f
	orl $(CPUID_FMA4), %esi
1:

	/* XOP */
	testl $(1 << 11), %ecx
	jz 1f
	orl $(CPUID_XOP), %esi
1:

Lcpuid_x86_skipxopplus:

Lcpuid_x86_skipavxplus:

Lcpuid_x86_done:
	/* hand the accumulated flags back in eax */
	movl %esi, %eax

	CPUID_EPILOGUE
FN_END_LOCAL_PREFIX cpuid_x86



GLOBAL_LOCAL_PREFIX cpucycles_x86
FN_LOCAL_PREFIX cpucycles_x86

	/* body comes from the CPUCYCLES macro: rdtsc, combined into rax on 64-bit */
	CPUCYCLES

FN_END_LOCAL_PREFIX cpucycles_x86
--------------------------------------------------------------------------------
/framework/driver/x86/gcc.inc:
--------------------------------------------------------------------------------
#ifndef BASE_GCC_X86_S
#define BASE_GCC_X86_S

#include "asmopt_internal.h"

#if !defined(HAVE_SLASHMACRO) && !defined(HAVE_DOLLARMACRO)
	#error Unknown gnu as macro parameter convention! Run ./configure
#endif

/* target classification, derived from compiler predefined macros */
#define IS_X86_32 (defined(__i386__))
#define IS_X86_64 (defined(__x86_64__))
#define IS_ELF (defined(__ELF__))
#define IS_MACH (defined(__MACH__))
#define IS_WIN32 (IS_X86_32 && (defined(_WIN32) || defined(__CYGWIN__)))
#define IS_WIN64 (IS_X86_64 && (defined(_WIN64) || defined(__CYGWIN__)))

#if (IS_WIN64)
	/*
		handles 0-6 arguments and optional saving of the upper 8 xmm registers

		Emits the public labels (name and _name) as a stub that moves the
		first four register arguments rcx, rdx, r8, r9 into rdi, rsi, rdx,
		rcx, calls the real body at thunk_name, then restores what it saved.

		184 byte frame:
			  0..31   xmm6, xmm7 (always saved)
			 32..159  xmm8..xmm15 (only when xmmused > 8)
			160, 168  rdi, rsi
		Arguments 5 and 6 are read from 224(%rsp) and 232(%rsp), i.e.
		184 (frame) + 8 (return address) + 32, and then + 8.
		NOTE(review): the movdqa stores need a 16 byte aligned rsp after the
		subq; this presumably relies on the Win64 entry alignment - confirm.
	*/
	.macro WIN64STUBFN name, args, xmmused
		\name:; _\name:;

		subq $(184), %rsp
		movdqa %xmm6, 0(%rsp)
		movdqa %xmm7, 16(%rsp)
		.if \xmmused > 8
			movdqa %xmm8, 32(%rsp)
			movdqa %xmm9, 48(%rsp)
			movdqa %xmm10, 64(%rsp)
			movdqa %xmm11, 80(%rsp)
			movdqa %xmm12, 96(%rsp)
			movdqa %xmm13, 112(%rsp)
			movdqa %xmm14, 128(%rsp)
			movdqa %xmm15, 144(%rsp)
		.endif
		movq %rdi, 160(%rsp)
		movq %rsi, 168(%rsp)
		movq %rcx, %rdi
		movq %rdx, %rsi
		movq %r8, %rdx
		movq %r9, %rcx
		.if \args >= 5
			movq 224(%rsp), %r8
		.endif
		.if \args >= 6
			movq 232(%rsp), %r9
		.endif
		call thunk_\name
		movdqa 0(%rsp), %xmm6
		movdqa 16(%rsp), %xmm7
		.if \xmmused > 8
			movdqa 32(%rsp), %xmm8
			movdqa 48(%rsp), %xmm9
			movdqa 64(%rsp), %xmm10
			movdqa 80(%rsp), %xmm11
			movdqa 96(%rsp), %xmm12
			movdqa 112(%rsp), %xmm13
			movdqa 128(%rsp), %xmm14
			movdqa 144(%rsp), %xmm15
		.endif
		movq 160(%rsp), %rdi
		movq 168(%rsp), %rsi
		addq $(184), %rsp
		ret
	thunk_\name:
	.endm

	/* FN name: stub with the defaults of 4 arguments and all 16 xmm registers */
	.macro FN name
		WIN64STUBFN \name, 4, 16
	.endm

	/* FN_EXT name, args, xmmused: stub with explicit argument and xmm counts */
	.macro FN_EXT name, args, xmmused
		WIN64STUBFN \name, \args, \xmmused
	.endm

	/* FN_END name: emits nothing on this target */
	.macro FN_END name
	.endm

78 | .macro HIDDEN name 79 | .endm 80 | #elif (IS_WIN32) 81 | .macro FN name 82 | \name: 83 | _\name: 84 | .endm 85 | 86 | .macro FN_EXT name, args, xmmused 87 | FN \name 88 | .endm 89 | 90 | .macro FN_END name 91 | .endm 92 | 93 | .macro HIDDEN name 94 | .endm 95 | #elif (IS_ELF) 96 | .macro FN name 97 | \name: 98 | _\name: 99 | .endm 100 | 101 | .macro FN_EXT name, args, xmmused 102 | FN \name 103 | .endm 104 | 105 | .macro FN_END name 106 | .size \name, .-\name 107 | .size _\name, .-_\name 108 | .type \name, @function 109 | .type _\name, @function 110 | .endm 111 | 112 | .macro HIDDEN name 113 | #if defined(HAVE_AS_HIDDEN) 114 | .hidden \name 115 | .hidden _\name 116 | #endif 117 | .endm 118 | 119 | /* set NX for stack */ 120 | .section .note.GNU-stack,"",@progbits 121 | #elif (IS_MACH) 122 | .macro FN name 123 | #if defined(HAVE_SLASHMACRO) 124 | \name: 125 | _\name: 126 | #elif defined(HAVE_DOLLARMACRO) 127 | $0: 128 | _$0: 129 | #endif 130 | .endm 131 | 132 | .macro FN_EXT name, args, xmmused 133 | #if defined(HAVE_SLASHMACRO) 134 | FN \name 135 | #elif defined(HAVE_DOLLARMACRO) 136 | FN $0 137 | #endif 138 | .endm 139 | 140 | .macro FN_END name 141 | .endm 142 | 143 | .macro HIDDEN name 144 | #if defined(HAVE_AS_PRIVATE_EXTERN) 145 | #if defined(HAVE_SLASHMACRO) 146 | .private_extern \name 147 | .private_extern _\name 148 | #elif defined(HAVE_DOLLARMACRO) 149 | .private_extern $0 150 | .private_extern _$0 151 | #endif 152 | #endif 153 | .endm 154 | #endif 155 | 156 | /* put everything in the code segment to simplify things */ 157 | #if (IS_MACH) 158 | .macro SECTION_TEXT 159 | .section __TEXT,__text,regular 160 | .endm 161 | 162 | .macro SECTION_RODATA 163 | .section __TEXT,__text,regular 164 | .endm 165 | #else 166 | /* put everything in the code segment to simplify things */ 167 | .macro SECTION_TEXT 168 | .text 169 | .endm 170 | 171 | .macro SECTION_RODATA 172 | .text 173 | .endm 174 | #endif 175 | 176 | /* declare a global function */ 177 | .macro GLOBAL 
name 178 | #if defined(HAVE_SLASHMACRO) 179 | .globl \name 180 | .globl _\name 181 | #elif defined(HAVE_DOLLARMACRO) 182 | .globl $0 183 | .globl _$0 184 | #endif 185 | .endm 186 | 187 | .macro FN_LOCAL_PREFIX name 188 | #if defined(HAVE_SLASHMACRO) 189 | FN LOCAL_PREFIX(\name) 190 | #elif defined(HAVE_DOLLARMACRO) 191 | FN LOCAL_PREFIX($0) 192 | #endif 193 | .endm 194 | 195 | .macro FN_EXT_LOCAL_PREFIX name, args, xmmused 196 | #if defined(HAVE_SLASHMACRO) 197 | FN_EXT LOCAL_PREFIX(\name), \args, \xmmused 198 | #elif defined(HAVE_DOLLARMACRO) 199 | FN_EXT LOCAL_PREFIX($0), $1, $2 200 | #endif 201 | .endm 202 | 203 | .macro FN_END_LOCAL_PREFIX name 204 | #if defined(HAVE_SLASHMACRO) 205 | FN_END LOCAL_PREFIX(\name) 206 | #elif defined(HAVE_DOLLARMACRO) 207 | FN_END LOCAL_PREFIX($0) 208 | #endif 209 | .endm 210 | 211 | .macro GLOBAL_LOCAL_PREFIX name 212 | #if defined(HAVE_SLASHMACRO) 213 | GLOBAL LOCAL_PREFIX(\name) 214 | HIDDEN LOCAL_PREFIX(\name) 215 | #elif defined(HAVE_DOLLARMACRO) 216 | GLOBAL LOCAL_PREFIX($0) 217 | HIDDEN LOCAL_PREFIX($0) 218 | #endif 219 | .endm 220 | 221 | .macro GLOBAL_HIDDEN_FN name 222 | #if defined(HAVE_SLASHMACRO) 223 | GLOBAL \name 224 | HIDDEN \name 225 | FN \name 226 | #elif defined(HAVE_DOLLARMACRO) 227 | GLOBAL $0 228 | HIDDEN $0 229 | FN $0 230 | #endif 231 | .endm 232 | 233 | .macro GLOBAL_HIDDEN_FN_EXT name, args, xmmused 234 | #if defined(HAVE_SLASHMACRO) 235 | GLOBAL \name 236 | HIDDEN \name 237 | FN_EXT \name, \args, \xmmused 238 | #elif defined(HAVE_DOLLARMACRO) 239 | GLOBAL $0 240 | HIDDEN $0 241 | FN_EXT $0, $1, $2 242 | #endif 243 | .endm 244 | 245 | /* pic support */ 246 | .macro LOAD_VAR_PIC var, reg 247 | #if (IS_X86_32) 248 | #if defined(HAVE_SLASHMACRO) 249 | call 1f 250 | 1: 251 | popl \reg 252 | leal \var - 1b(\reg), \reg 253 | #elif defined(HAVE_DOLLARMACRO) 254 | call 1f 255 | 1: 256 | popl $1 257 | leal $0 - 1b($1), $1 258 | #endif 259 | #else 260 | #if defined(HAVE_SLASHMACRO) 261 | leaq \var(%rip), \reg 262 | 
#elif defined(HAVE_DOLLARMACRO) 263 | leaq $0(%rip), $1 264 | #endif 265 | #endif 266 | .endm 267 | 268 | #if defined(HAVE_SLASHMACRO) 269 | #define INCLUDE_FILE_PARM "\file" 270 | #elif defined(HAVE_DOLLARMACRO) 271 | #define INCLUDE_FILE_PARM $0 272 | #endif 273 | 274 | 275 | .macro INCLUDE file 276 | .include INCLUDE_FILE_PARM 277 | .endm 278 | 279 | /* include the file with the variable(s) if variable 'name' is not already included */ 280 | .macro INCLUDE_VAR_FILE file, name 281 | #if defined(HAVE_SLASHMACRO) 282 | .ifndef \name 283 | .include INCLUDE_FILE_PARM 284 | .endif 285 | #elif defined(HAVE_DOLLARMACRO) 286 | .ifndef $1 287 | .include INCLUDE_FILE_PARM 288 | .endif 289 | #endif 290 | .endm 291 | 292 | /* stupid helpers so we can have cpuid in one file */ 293 | 294 | .macro CPUID_PROLOGUE 295 | #if (IS_X86_32) 296 | pushl %ebx 297 | pushl %esi 298 | pushl %edi 299 | pushl %ebp 300 | 301 | /* check that cpuid is supported */ 302 | pushfl 303 | popl %eax 304 | movl %eax, %ecx 305 | xorl $(0x200000), %eax 306 | pushl %eax 307 | popfl 308 | pushfl 309 | popl %eax 310 | xorl %ecx, %eax 311 | shrl $(21), %eax 312 | andl $(1), %eax 313 | pushl %ecx 314 | popfl 315 | andl %eax, %eax 316 | jz Lcpuid_x86_done 317 | #else 318 | pushq %rbx 319 | pushq %rsi 320 | pushq %rdi 321 | pushq %rbp 322 | #endif 323 | .endm 324 | 325 | .macro CPUID_EPILOGUE 326 | #if (IS_X86_32) 327 | popl %ebp 328 | popl %edi 329 | popl %esi 330 | popl %ebx 331 | #else 332 | popq %rbp 333 | popq %rdi 334 | popq %rsi 335 | popq %rbx 336 | #endif 337 | ret 338 | .endm 339 | 340 | .macro CPUCYCLES 341 | rdtsc 342 | #if (IS_X86_64) 343 | shlq $(32), %rdx 344 | orq %rdx, %rax 345 | #endif 346 | ret 347 | .endm 348 | 349 | 350 | /* Macros for CPUID only */ 351 | 352 | #define CPUID_GENERIC (0 ) 353 | #define CPUID_X86 (1 << 0) 354 | #define CPUID_MMX (1 << 1) 355 | #define CPUID_SSE (1 << 2) 356 | #define CPUID_SSE2 (1 << 3) 357 | #define CPUID_SSE3 (1 << 4) 358 | #define CPUID_SSSE3 (1 << 5) 359 
| #define CPUID_SSE4_1 (1 << 6) 360 | #define CPUID_SSE4_2 (1 << 7) 361 | #define CPUID_AVX (1 << 8) 362 | #define CPUID_XOP (1 << 9) 363 | #define CPUID_AVX2 (1 << 10) 364 | #define CPUID_AVX512 (1 << 11) 365 | 366 | #define CPUID_RDTSC (1 << 25) 367 | #define CPUID_RDRAND (1 << 26) 368 | #define CPUID_POPCNT (1 << 27) 369 | #define CPUID_FMA4 (1 << 28) 370 | #define CPUID_FMA3 (1 << 29) 371 | #define CPUID_PCLMULQDQ (1 << 30) 372 | #define CPUID_AES (1 << 31) 373 | 374 | #endif /* BASE_GCC_X86_S */ 375 | -------------------------------------------------------------------------------- /framework/driver/x86/yasm.inc: -------------------------------------------------------------------------------- 1 | %ifndef BASE_YASM 2 | %define BASE_YASM 3 | 4 | ; 1.1.0 and earlier incorrectly parsed movsw/movzw in gas mode: https://github.com/yasm/yasm/commit/2678cb3c3a42b3870a209ed8de38c1a16449695a 5 | %if (__YASM_VERSION_ID__ < 01020000h) ; 1.2.0 6 | %error Requires Yasm 1.2.0 or higher 7 | %endif 8 | 9 | %define HAVE_XOP 0 10 | %define HAVE_AVX2 0 11 | %define HAVE_AVX512 0 12 | 13 | %if (__YASM_VERSION_ID__ >= 01000000h) ; 1.0.0 14 | %define HAVE_XOP 1 15 | %endif 16 | 17 | %if (__YASM_VERSION_ID__ >= 01020000h) ; 1.2.0 18 | %define HAVE_AVX2 1 19 | %endif 20 | 21 | %if (__YASM_VERSION_ID__ >= 999999999) ; avx-512 isn't supported yet 22 | %define HAVE_AVX512 1 23 | %endif 24 | 25 | 26 | %define BITS32 0 27 | %define BITS64 0 28 | %define WIN 0 29 | %define ELF 0 30 | %define MACH 0 31 | 32 | %ifidn __YASM_OBJFMT__, win32 33 | %define BITS32 1 34 | %define WIN 1 35 | %elifidn __YASM_OBJFMT__, elf 36 | %error Specify bits with -f [elf32,elf64] 37 | %elifidn __YASM_OBJFMT__, elf32 38 | %define BITS32 1 39 | %define ELF 1 40 | %elifidn __YASM_OBJFMT__, macho 41 | %error Specify bits with -f [macho32,macho64] 42 | %elifidn __YASM_OBJFMT__, macho32 43 | %define BITS32 1 44 | %define MACH 1 45 | %elifidn __YASM_OBJFMT__, win64 46 | %define BITS64 1 47 | %define WIN 1 48 | %elifidn 
__YASM_OBJFMT__, x64 49 | %define BITS64 1 50 | %define WIN 1 51 | %elifidn __YASM_OBJFMT__, elf64 52 | %define BITS64 1 53 | %define ELF 1 54 | %elifidn __YASM_OBJFMT__, macho64 55 | %define BITS64 1 56 | %define MACH 1 57 | %else 58 | %error "Unable to determine output format" 59 | %endif 60 | 61 | %if (WIN) 62 | %if (BITS64) 63 | ; name, args, xmmused 64 | %macro win64stubfn 3 65 | %1: 66 | _ %+ %1: 67 | 68 | subq $184, %rsp 69 | movdqa %xmm6, 0(%rsp) 70 | movdqa %xmm7, 16(%rsp) 71 | %if (%3 > 8) 72 | movdqa %xmm8, 32(%rsp) 73 | movdqa %xmm9, 48(%rsp) 74 | movdqa %xmm10, 64(%rsp) 75 | movdqa %xmm11, 80(%rsp) 76 | movdqa %xmm12, 96(%rsp) 77 | movdqa %xmm13, 112(%rsp) 78 | movdqa %xmm14, 128(%rsp) 79 | movdqa %xmm15, 144(%rsp) 80 | %endif 81 | movq %rdi, 160(%rsp) 82 | movq %rsi, 168(%rsp) 83 | movq %rcx, %rdi 84 | movq %rdx, %rsi 85 | movq %r8, %rdx 86 | movq %r9, %rcx 87 | %if (%2 >= 5) 88 | movq 224(%rsp), %r8 89 | %endif 90 | %if (%2 >= 6) 91 | movq 232(%rsp), %r9 92 | %endif 93 | call thunk_ %+ %1 94 | movdqa 0(%rsp), %xmm6 95 | movdqa 16(%rsp), %xmm7 96 | %if (%3 > 8) 97 | movdqa 32(%rsp), %xmm8 98 | movdqa 48(%rsp), %xmm9 99 | movdqa 64(%rsp), %xmm10 100 | movdqa 80(%rsp), %xmm11 101 | movdqa 96(%rsp), %xmm12 102 | movdqa 112(%rsp), %xmm13 103 | movdqa 128(%rsp), %xmm14 104 | movdqa 144(%rsp), %xmm15 105 | %endif 106 | movq 160(%rsp), %rdi 107 | movq 168(%rsp), %rsi 108 | addq $184, %rsp 109 | ret 110 | thunk_ %+ %1: 111 | %endmacro 112 | 113 | ; FN name 114 | %macro FN 1 115 | win64stubfn %1, 4, 16 116 | %endmacro 117 | 118 | ; FN_EXT name, args, xmmused 119 | %macro FN_EXT 3 120 | win64stubfn %1, %2, %3 121 | %endmacro 122 | 123 | ; FN_END name 124 | %macro FN_END 1 125 | %endmacro 126 | %else 127 | ; FN name 128 | %macro FN 1 129 | %1: 130 | _ %+ %1: 131 | %endmacro 132 | 133 | ; FN_EXT name, args, xmmused 134 | %macro FN_EXT 3 135 | %1: 136 | _ %+ %1: 137 | %endmacro 138 | 139 | ; FN_END name 140 | %macro FN_END 1 141 | %endmacro 142 | %endif 143 | 144 
| %macro HIDDEN 1 145 | %endmacro 146 | %elif (ELF) 147 | ; FN name 148 | %macro FN 1 149 | %1: 150 | _ %+ %1: 151 | %endmacro 152 | 153 | ; FN_EXT name, args, xmmused 154 | %macro FN_EXT 3 155 | %1: 156 | _ %+ %1: 157 | %endmacro 158 | 159 | ; FN_END name 160 | %macro FN_END 1 161 | .size %1, .-%1 162 | .type %1, @function 163 | %endmacro 164 | 165 | ; declares a global is hidden: HIDDEN name 166 | %macro HIDDEN 1 167 | %if (__YASM_VERSION_ID__ >= 09999999h) ; .hidden isn't in yasm yet? 168 | .hidden %1 169 | .hidden _ %+ %1 170 | %endif 171 | %endmacro 172 | 173 | ; set NX for stack 174 | .section .note.GNU-stack,"",@progbits 175 | %elif (MACH) 176 | ; FN name 177 | %macro FN 1 178 | %1: 179 | _ %+ %1: 180 | %endmacro 181 | 182 | ; FN_EXT name, args, xmmused 183 | %macro FN_EXT 3 184 | %1: 185 | _ %+ %1: 186 | %endmacro 187 | 188 | ; FN_END name 189 | %macro FN_END 1 190 | %endmacro 191 | 192 | ; declares a global is hidden: HIDDEN name 193 | %macro HIDDEN 1 194 | %if (__YASM_VERSION_ID__ >= 09999999h) ; .private_extern isn't in yasm yet? 
195 | .private_extern %1 196 | .private_extern _ %+ %1 197 | %endif 198 | %endmacro 199 | %endif 200 | 201 | ; put everything in the code segment to simplify things 202 | %define SECTION_TEXT .section .text 203 | %define SECTION_RODATA .section .text 204 | 205 | ; declares a global function: GLOBAL name 206 | %macro GLOBAL 1 207 | .globl %1 208 | .globl _ %+ %1 209 | %endmacro 210 | 211 | %macro FN_LOCAL_PREFIX 1 212 | FN PROJECT_NAME %+ _ %+ %1 213 | %endmacro 214 | 215 | %macro FN_EXT_LOCAL_PREFIX 3 216 | FN_EXT PROJECT_NAME %+ _ %+ %1, %2, %3 217 | %endmacro 218 | 219 | %macro FN_END_LOCAL_PREFIX 1 220 | FN_END PROJECT_NAME %+ _ %+ %1 221 | %endmacro 222 | 223 | %macro GLOBAL_LOCAL_PREFIX 1 224 | GLOBAL PROJECT_NAME %+ _ %+ %1 225 | HIDDEN PROJECT_NAME %+ _ %+ %1 226 | %endmacro 227 | 228 | ; name 229 | %macro GLOBAL_HIDDEN_FN 1 230 | GLOBAL %1 231 | HIDDEN %1 232 | FN %1 233 | %endmacro 234 | 235 | ; name, args, xmmused 236 | %macro GLOBAL_HIDDEN_FN_EXT 3 237 | GLOBAL %1 238 | HIDDEN %1 239 | FN_EXT %1, %2, %3 240 | %endmacro 241 | 242 | 243 | ; pic support: LOAD_VAR_PIC var, reg 244 | %macro LOAD_VAR_PIC 2 245 | %if (BITS32) 246 | call 1f 247 | 1: 248 | popl %2 249 | leal %1 - 1b(%2), %2 250 | %else 251 | leaq %1(%rip), %2 252 | %endif 253 | %endmacro 254 | 255 | %macro INCLUDE 1 256 | %include %1 257 | %endmacro 258 | 259 | ; include the file with the variable(s) if variable 'name' is not already included: INCLUDE_VAR_FILE file, name 260 | %macro INCLUDE_VAR_FILE 2 261 | %ifndef INCLUDED_%2 262 | %define INCLUDED_%2 263 | %include %1 264 | %endif 265 | %endmacro 266 | 267 | ; stupid helpers so we can have cpuid in one file 268 | 269 | %macro CPUID_PROLOGUE 0 270 | %if (BITS32) 271 | pushl %ebx 272 | pushl %esi 273 | pushl %edi 274 | pushl %ebp 275 | 276 | ; check that cpuid is supported 277 | pushfl 278 | popl %eax 279 | movl %eax, %ecx 280 | xorl $0x200000, %eax 281 | pushl %eax 282 | popfl 283 | pushfl 284 | popl %eax 285 | xorl %ecx, %eax 286 | shrl $21, 
%eax 287 | andl $1, %eax 288 | pushl %ecx 289 | popfl 290 | andl %eax, %eax 291 | jz Lcpuid_x86_done 292 | %else 293 | pushq %rbx 294 | pushq %rsi 295 | pushq %rdi 296 | pushq %rbp 297 | %endif 298 | %endmacro 299 | 300 | %macro CPUID_EPILOGUE 0 301 | %if (BITS32) 302 | popl %ebp 303 | popl %edi 304 | popl %esi 305 | popl %ebx 306 | %else 307 | popq %rbp 308 | popq %rdi 309 | popq %rsi 310 | popq %rbx 311 | %endif 312 | ret 313 | %endmacro 314 | 315 | %macro CPUCYCLES 0 316 | rdtsc 317 | %if (BITS64) 318 | shlq $32, %rdx 319 | orq %rdx, %rax 320 | %endif 321 | ret 322 | %endmacro 323 | 324 | %define CPUID_GENERIC (0 ) 325 | %define CPUID_X86 (1 << 0) 326 | %define CPUID_MMX (1 << 1) 327 | %define CPUID_SSE (1 << 2) 328 | %define CPUID_SSE2 (1 << 3) 329 | %define CPUID_SSE3 (1 << 4) 330 | %define CPUID_SSSE3 (1 << 5) 331 | %define CPUID_SSE4_1 (1 << 6) 332 | %define CPUID_SSE4_2 (1 << 7) 333 | %define CPUID_AVX (1 << 8) 334 | %define CPUID_XOP (1 << 9) 335 | %define CPUID_AVX2 (1 << 10) 336 | %define CPUID_AVX512 (1 << 11) 337 | 338 | %define CPUID_RDTSC (1 << 25) 339 | %define CPUID_RDRAND (1 << 26) 340 | %define CPUID_POPCNT (1 << 27) 341 | %define CPUID_FMA4 (1 << 28) 342 | %define CPUID_FMA3 (1 << 29) 343 | %define CPUID_PCLMULQDQ (1 << 30) 344 | %define CPUID_AES (1 << 31) 345 | 346 | %endif ; BASE_YASM 347 | -------------------------------------------------------------------------------- /framework/driver/yasm_driver.inc: -------------------------------------------------------------------------------- 1 | %ifndef YASM_DRIVER_INC 2 | %define YASM_DRIVER_INC 3 | 4 | %include "asmopt_internal.h" 5 | 6 | %include "x86/yasm.inc" 7 | 8 | %macro INCLUDE_IF_X86_32BIT 1 9 | %if (BITS32) 10 | INCLUDE %1 11 | %endif 12 | %endmacro 13 | 14 | %macro INCLUDE_IF_X86_64BIT 1 15 | %if (BITS64) 16 | INCLUDE %1 17 | %endif 18 | %endmacro 19 | 20 | %macro INCLUDE_IF_MMX_32BIT 1 21 | INCLUDE_IF_X86_32BIT %1 22 | %endmacro 23 | 24 | %macro INCLUDE_IF_MMX_64BIT 1 25 | 
INCLUDE_IF_X86_64BIT %1 26 | %endmacro 27 | 28 | 29 | %macro INCLUDE_IF_SSE_32BIT 1 30 | INCLUDE_IF_X86_32BIT %1 31 | %endmacro 32 | 33 | %macro INCLUDE_IF_SSE_64BIT 1 34 | INCLUDE_IF_X86_64BIT %1 35 | %endmacro 36 | 37 | 38 | %macro INCLUDE_IF_SSE2_32BIT 1 39 | INCLUDE_IF_X86_32BIT %1 40 | %endmacro 41 | 42 | %macro INCLUDE_IF_SSE2_64BIT 1 43 | INCLUDE_IF_X86_64BIT %1 44 | %endmacro 45 | 46 | 47 | %macro INCLUDE_IF_SSE3_32BIT 1 48 | INCLUDE_IF_X86_32BIT %1 49 | %endmacro 50 | 51 | %macro INCLUDE_IF_SSE3_64BIT 1 52 | INCLUDE_IF_X86_64BIT %1 53 | %endmacro 54 | 55 | 56 | %macro INCLUDE_IF_SSSE3_32BIT 1 57 | INCLUDE_IF_X86_32BIT %1 58 | %endmacro 59 | 60 | %macro INCLUDE_IF_SSSE3_64BIT 1 61 | INCLUDE_IF_X86_64BIT %1 62 | %endmacro 63 | 64 | 65 | %macro INCLUDE_IF_SSE4_1_32BIT 1 66 | INCLUDE_IF_X86_32BIT %1 67 | %endmacro 68 | 69 | %macro INCLUDE_IF_SSE4_1_64BIT 1 70 | INCLUDE_IF_X86_64BIT %1 71 | %endmacro 72 | 73 | 74 | %macro INCLUDE_IF_SSE4_2_32BIT 1 75 | INCLUDE_IF_X86_32BIT %1 76 | %endmacro 77 | 78 | %macro INCLUDE_IF_SSE4_2_64BIT 1 79 | INCLUDE_IF_X86_64BIT %1 80 | %endmacro 81 | 82 | 83 | %macro INCLUDE_IF_AVX_32BIT 1 84 | INCLUDE_IF_X86_32BIT %1 85 | %endmacro 86 | 87 | %macro INCLUDE_IF_AVX_64BIT 1 88 | INCLUDE_IF_X86_64BIT %1 89 | %endmacro 90 | 91 | 92 | %macro INCLUDE_IF_XOP_32BIT 1 93 | %if HAVE_XOP 94 | INCLUDE_IF_X86_32BIT %1 95 | %endif 96 | %endmacro 97 | 98 | %macro INCLUDE_IF_XOP_64BIT 1 99 | %if HAVE_XOP 100 | INCLUDE_IF_X86_64BIT %1 101 | %endif 102 | %endmacro 103 | 104 | 105 | %macro INCLUDE_IF_AVX2_32BIT 1 106 | %if HAVE_AVX2 107 | INCLUDE_IF_X86_32BIT %1 108 | %endif 109 | %endmacro 110 | 111 | %macro INCLUDE_IF_AVX2_64BIT 1 112 | %if HAVE_AVX2 113 | INCLUDE_IF_X86_64BIT %1 114 | %endif 115 | %endmacro 116 | 117 | 118 | %macro INCLUDE_IF_AVX512_32BIT 1 119 | %if HAVE_AVX512 120 | INCLUDE_IF_X86_32BIT %1 121 | %endif 122 | %endmacro 123 | 124 | %macro INCLUDE_IF_AVX512_64BIT 1 125 | %if HAVE_AVX512 126 | INCLUDE_IF_X86_64BIT %1 127 | %endif 
128 | %endmacro 129 | 130 | ; include unsupported platform includes here 131 | ; ... 132 | ; ... 133 | ; ... 134 | 135 | %endif ; YASM_DRIVER_INC -------------------------------------------------------------------------------- /framework/fuzz.c: -------------------------------------------------------------------------------- 1 | #if (defined(_WIN32) || defined(_WIN64)) 2 | #include 3 | #include 4 | #endif 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "cpuid.h" 11 | #include "fuzz.h" 12 | 13 | /* 14 | Chacha/8 rng with no addition of state words post-mixing, no security at all, but good 15 | portable random numbers for fuzzing 16 | */ 17 | 18 | #if defined(HAVE_INT32) 19 | typedef uint32_t chacha_int32; 20 | #else 21 | typedef unsigned long chacha_int32; 22 | #endif 23 | 24 | /* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */ 25 | static void 26 | store8(unsigned char *p, chacha_int32 v) { 27 | p[0] = (v ) & 0xff; 28 | p[1] = (v >> 8) & 0xff; 29 | p[2] = (v >> 16) & 0xff; 30 | p[3] = (v >> 24) & 0xff; 31 | } 32 | 33 | /* 32 bit left rotate */ 34 | static chacha_int32 35 | rotate32(chacha_int32 x, int k) { 36 | return ((x << k) | (x >> (32 - k))) & 0xffffffffUL; 37 | } 38 | 39 | typedef struct chacha_state_t { 40 | chacha_int32 s[12]; 41 | } chacha_state_t; 42 | 43 | /* 1 block = 64 bytes */ 44 | static void 45 | chacha_blocks(chacha_state_t *state, unsigned char *out, size_t blocks) { 46 | chacha_int32 x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 47 | chacha_int32 j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15; 48 | chacha_int32 t; 49 | size_t i; 50 | 51 | j4 = state->s[0]; 52 | j5 = state->s[1]; 53 | j6 = state->s[2]; 54 | j7 = state->s[3]; 55 | j8 = state->s[4]; 56 | j9 = state->s[5]; 57 | j10 = state->s[6]; 58 | j11 = state->s[7]; 59 | j12 = state->s[8]; 60 | j13 = state->s[9]; 61 | j14 = state->s[10]; 62 | j15 = state->s[11]; 63 | 64 | for ( ; blocks; blocks -= 1, out += 64) { 65 | /* "expand 
32-byte k", as 4 little endian 32-bit unsigned integers */ 66 | x0 = 0x61707865; 67 | x1 = 0x3320646e; 68 | x2 = 0x79622d32; 69 | x3 = 0x6b206574; 70 | x4 = j4; 71 | x5 = j5; 72 | x6 = j6; 73 | x7 = j7; 74 | x8 = j8; 75 | x9 = j9; 76 | x10 = j10; 77 | x11 = j11; 78 | x12 = j12; 79 | x13 = j13; 80 | x14 = j14; 81 | x15 = j15; 82 | 83 | #define quarter(a,b,c,d) \ 84 | a = (a + b) & 0xffffffffUL; t = d^a; d = rotate32(t,16); \ 85 | c = (c + d) & 0xffffffffUL; t = b^c; b = rotate32(t,12); \ 86 | a = (a + b) & 0xffffffffUL; t = d^a; d = rotate32(t, 8); \ 87 | c = (c + d) & 0xffffffffUL; t = b^c; b = rotate32(t, 7); 88 | 89 | for (i = 0; i < 8; i += 2) { 90 | quarter( x0, x4, x8,x12) 91 | quarter( x1, x5, x9,x13) 92 | quarter( x2, x6,x10,x14) 93 | quarter( x3, x7,x11,x15) 94 | quarter( x0, x5,x10,x15) 95 | quarter( x1, x6,x11,x12) 96 | quarter( x2, x7, x8,x13) 97 | quarter( x3, x4, x9,x14) 98 | } 99 | 100 | store8(out + 0, x0); 101 | store8(out + 4, x1); 102 | store8(out + 8, x2); 103 | store8(out + 12, x3); 104 | store8(out + 16, x4); 105 | store8(out + 20, x5); 106 | store8(out + 24, x6); 107 | store8(out + 28, x7); 108 | store8(out + 32, x8); 109 | store8(out + 36, x9); 110 | store8(out + 40, x10); 111 | store8(out + 44, x11); 112 | store8(out + 48, x12); 113 | store8(out + 52, x13); 114 | store8(out + 56, x14); 115 | store8(out + 60, x15); 116 | 117 | /* use counter+iv as a 128 bit counter */ 118 | j12 = (j12 + 1); 119 | if (!j12) { 120 | j13 = (j13 + 1); 121 | if (!j13) { 122 | j14 = (j14 + 1); 123 | if (!j14) 124 | j15 = (j15 + 1); 125 | } 126 | } 127 | } 128 | 129 | state->s[8] = j12; 130 | state->s[9] = j13; 131 | state->s[10] = j14; 132 | state->s[11] = j15; 133 | } 134 | 135 | typedef struct fuzz_state_t { 136 | chacha_state_t rng; 137 | unsigned char buffer[64]; 138 | size_t remaining; 139 | } fuzz_state_t; 140 | 141 | static fuzz_state_t fuzz_state; 142 | 143 | /* reload the fuzz random number buffer */ 144 | static void 145 | fuzz_reload(fuzz_state_t *st) { 
146 | chacha_blocks(&st->rng, st->buffer, sizeof(st->buffer) / 64); 147 | st->remaining = sizeof(st->buffer); 148 | } 149 | 150 | /* initialize the state to all zeros */ 151 | void 152 | fuzz_init_deterministic(void) { 153 | memset(&fuzz_state.rng, 0, sizeof(fuzz_state.rng)); 154 | fuzz_reload(&fuzz_state); 155 | } 156 | 157 | /* initialize the state randomly */ 158 | void 159 | fuzz_init(void) { 160 | #if (defined(_WIN32) || defined(_WIN64)) 161 | HCRYPTPROV handle; 162 | if (!CryptAcquireContext(&handle, 0, 0, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) { 163 | fprintf(stderr, "CryptAcquireContext failed"); 164 | exit(1); 165 | } 166 | CryptGenRandom(handle, sizeof(fuzz_state.rng), (BYTE*)&fuzz_state.rng); 167 | CryptReleaseContext(handle, 0); 168 | #else 169 | FILE *f = fopen("/dev/urandom", "r"); 170 | if (!f) { 171 | fprintf(stderr, "failed to open /dev/urandom"); 172 | exit(1); 173 | } 174 | if (fread(&fuzz_state.rng, sizeof(fuzz_state.rng), 1, f) != 1) { 175 | fprintf(stderr, "read on /dev/urandom failed"); 176 | exit(1); 177 | } 178 | fclose(f); 179 | #endif 180 | fuzz_reload(&fuzz_state); 181 | } 182 | 183 | /* get len random bytes */ 184 | void 185 | fuzz_get_bytes(void *out, size_t len) { 186 | unsigned char *outb = (unsigned char *)out; 187 | 188 | while (len) { 189 | /* drain the stored buffer first */ 190 | if (fuzz_state.remaining) { 191 | size_t bytes = (len > fuzz_state.remaining) ? 
fuzz_state.remaining : len; 192 | memcpy(outb, fuzz_state.buffer + (sizeof(fuzz_state.buffer) - fuzz_state.remaining), bytes); 193 | 194 | fuzz_state.remaining -= bytes; 195 | outb += bytes; 196 | len -= bytes; 197 | } 198 | 199 | /* fill up with full blocks */ 200 | if (len >= 64) { 201 | size_t bytes = (len & ~63), blocks = len / 64; 202 | chacha_blocks(&fuzz_state.rng, outb, blocks); 203 | outb += bytes; 204 | len -= bytes; 205 | } 206 | 207 | /* refill the stored buffer if needed */ 208 | if (!fuzz_state.remaining) 209 | fuzz_reload(&fuzz_state); 210 | } 211 | } 212 | 213 | /* print len bytes from bytes in hex format, xor'd against base if bytes != base */ 214 | void 215 | fuzz_print_bytes(const char *desc, const unsigned char *bytes, const unsigned char *base, size_t len) { 216 | size_t i; 217 | printf("%s: ", desc); 218 | for (i = 0; i < len; i++) { 219 | if (i && ((i % 16) == 0)) 220 | printf("\n"); 221 | if (base != bytes) { 222 | unsigned char diff = base[i] ^ bytes[i]; 223 | if (diff) 224 | printf("0x%02x,", diff); 225 | else 226 | printf("____,"); 227 | } else { 228 | printf("0x%02x,", bytes[i]); 229 | } 230 | } 231 | printf("\n\n"); 232 | } 233 | 234 | static void 235 | fuzz_print_input(const fuzz_variable_t *input_variables, const size_t *random_sizes, const unsigned char *input) { 236 | size_t random_size; 237 | 238 | for ( ; ; input_variables++) { 239 | switch (input_variables->type) { 240 | case FUZZ_DONE: 241 | return; 242 | 243 | case FUZZ_ARRAY: 244 | fuzz_print_bytes(input_variables->desc, input, input, input_variables->size); 245 | input += input_variables->size; 246 | break; 247 | 248 | case FUZZ_RANDOM_LENGTH_ARRAY0: 249 | case FUZZ_RANDOM_LENGTH_ARRAY1: 250 | case FUZZ_RANDOM_LENGTH_ARRAY2: 251 | case FUZZ_RANDOM_LENGTH_ARRAY3: 252 | random_size = random_sizes[input_variables->type - FUZZ_RANDOM_LENGTH_ARRAY0]; 253 | fuzz_print_bytes(input_variables->desc, input, input, random_size); 254 | input += random_size; 255 | break; 256 | } 257 | } 
258 | } 259 | 260 | 261 | static void 262 | fuzz_print_output(const cpu_specific_impl_t *impl, const fuzz_variable_t *output_variables, const size_t *random_sizes, const unsigned char *output, const unsigned char *generic_output) { 263 | size_t random_size; 264 | 265 | printf("IMPLEMENTATION: %s\n", impl->desc); 266 | 267 | for ( ; ; output_variables++) { 268 | switch (output_variables->type) { 269 | case FUZZ_DONE: 270 | return; 271 | 272 | case FUZZ_ARRAY: 273 | fuzz_print_bytes(output_variables->desc, output, generic_output, output_variables->size); 274 | output += output_variables->size; 275 | generic_output += output_variables->size; 276 | break; 277 | 278 | case FUZZ_RANDOM_LENGTH_ARRAY0: 279 | case FUZZ_RANDOM_LENGTH_ARRAY1: 280 | case FUZZ_RANDOM_LENGTH_ARRAY2: 281 | case FUZZ_RANDOM_LENGTH_ARRAY3: 282 | random_size = random_sizes[output_variables->type - FUZZ_RANDOM_LENGTH_ARRAY0]; 283 | fuzz_print_bytes(output_variables->desc, output, generic_output, random_size); 284 | output += random_size; 285 | generic_output += random_size; 286 | break; 287 | } 288 | } 289 | } 290 | 291 | /* run the fuzzer */ 292 | void 293 | fuzz(const void *impls, size_t impl_size, const fuzz_variable_t *input_variables, const fuzz_variable_t *output_variables, impl_fuzz fuzz_fn) { 294 | /* allocate data */ 295 | unsigned char *fuzz_input = NULL, *fuzz_output = NULL; 296 | const cpu_specific_impl_t **impl_list_alloc = (const cpu_specific_impl_t **)malloc(sizeof(const cpu_specific_impl_t *) * 32), **impl_list; 297 | size_t impl_count = 0; 298 | size_t random_sizes[4], *random_size; 299 | 300 | /* cpu detection */ 301 | unsigned long cpu_flags = LOCAL_PREFIX(cpuid)(); 302 | const char *p = (const char *)impls; 303 | 304 | size_t expected_bytes_out; 305 | unsigned char *outp; 306 | size_t i; 307 | 308 | /* counter display */ 309 | clock_t start, clocks; 310 | size_t counter, counter_dot, counter_line; 311 | int display_counter; 312 | 313 | /* aggregate number of implementations, 
storing them in reverse order (generic first, most optimized last) */ 314 | impl_list = &impl_list_alloc[31]; 315 | for (;;) { 316 | const cpu_specific_impl_t *impl = (const cpu_specific_impl_t *)p; 317 | if (impl->cpu_flags == (impl->cpu_flags & cpu_flags)) 318 | *(impl_list--) = (const cpu_specific_impl_t *)impl; 319 | if (impl->cpu_flags == CPUID_GENERIC) 320 | break; 321 | p += impl_size; 322 | } 323 | 324 | /* need at least 2 added to do anything interesting */ 325 | impl_count = (&impl_list_alloc[31] - impl_list); 326 | if (impl_count <= 1) { 327 | printf("not enough implementations to fuzz..\n"); 328 | goto done; 329 | } 330 | /* point it at the last impl added */ 331 | impl_list += 1; 332 | 333 | /* 16k for raw data, 1k for key material and derived data */ 334 | fuzz_input = (unsigned char *)malloc(16384 + 1024); 335 | fuzz_output = (unsigned char *)malloc((16384 + 1024) * impl_count); 336 | 337 | /* show list of implementations being fuzzed */ 338 | printf("fuzzing %s", impl_list[0]->desc); 339 | for (i = 1; i < impl_count; i++) { 340 | printf(", %s", impl_list[i]->desc); 341 | } 342 | printf("\n\n"); 343 | 344 | /* fuzz loop */ 345 | display_counter = 0; 346 | counter = 0; 347 | counter_dot = 0; 348 | counter_line = 0; 349 | 350 | start = clock(); 351 | for (;;) { 352 | unsigned char *inp = fuzz_input; 353 | unsigned char *generic_out = fuzz_output; 354 | 355 | /* set up the data for this run */ 356 | for (i = 0; input_variables[i].type != FUZZ_DONE; i++) { 357 | switch (input_variables[i].type) { 358 | case FUZZ_DONE: 359 | break; 360 | 361 | case FUZZ_ARRAY: 362 | fuzz_get_bytes(inp, input_variables[i].size); 363 | inp += input_variables[i].size; 364 | break; 365 | 366 | case FUZZ_RANDOM_LENGTH_ARRAY0: 367 | case FUZZ_RANDOM_LENGTH_ARRAY1: 368 | case FUZZ_RANDOM_LENGTH_ARRAY2: 369 | case FUZZ_RANDOM_LENGTH_ARRAY3: 370 | random_size = &random_sizes[input_variables[i].type - FUZZ_RANDOM_LENGTH_ARRAY0]; 371 | fuzz_get_bytes(random_size, 
sizeof(*random_size)); 372 | *random_size = (*random_size % input_variables[i].size); 373 | fuzz_get_bytes(inp, *random_size); 374 | inp += *random_size; 375 | break; 376 | } 377 | } 378 | 379 | expected_bytes_out = 0; 380 | for (i = 0; output_variables[i].type != FUZZ_DONE; i++) { 381 | switch (output_variables[i].type) { 382 | case FUZZ_DONE: 383 | break; 384 | 385 | case FUZZ_ARRAY: 386 | expected_bytes_out += output_variables[i].size; 387 | break; 388 | 389 | case FUZZ_RANDOM_LENGTH_ARRAY0: 390 | case FUZZ_RANDOM_LENGTH_ARRAY1: 391 | case FUZZ_RANDOM_LENGTH_ARRAY2: 392 | case FUZZ_RANDOM_LENGTH_ARRAY3: 393 | random_size = &random_sizes[output_variables[i].type - FUZZ_RANDOM_LENGTH_ARRAY0]; 394 | expected_bytes_out += *random_size; 395 | break; 396 | } 397 | } 398 | 399 | /* gather results */ 400 | outp = fuzz_output; 401 | for (i = 0; i < impl_count; i++) { 402 | fuzz_fn(impl_list[i], fuzz_input, random_sizes, outp); 403 | outp += expected_bytes_out; 404 | } 405 | 406 | /* compare results */ 407 | outp = fuzz_output + expected_bytes_out; 408 | for (i = 1; i < impl_count; i++) { 409 | if (memcmp(generic_out, outp, expected_bytes_out) != 0) 410 | goto failure; 411 | outp += expected_bytes_out; 412 | } 413 | 414 | counter++; 415 | 416 | /* are we still calibrating? */ 417 | if (!display_counter) { 418 | clocks = clock(); 419 | if (clocks == (clock_t)-1) { 420 | /* clock is broken, use values which might suck.. 
*/ 421 | counter_line = 8192; 422 | counter_dot = (counter_line / 32); 423 | counter = 0; 424 | display_counter = 1; 425 | } else if ((clocks - start) >= CLOCKS_PER_SEC) { 426 | printf("doing approximately %u passes a second..\n", (unsigned int)(counter)); 427 | 428 | /* 32 dots per line, 1 line per ~5 seconds */ 429 | counter_line = 1; 430 | counter *= 5; 431 | while (counter_line < counter) 432 | counter_line *= 2; 433 | if (counter_line < 32) 434 | counter_line = 32; 435 | counter_dot = (counter_line / 32); 436 | if (counter_dot < 1) 437 | counter_dot = 1; 438 | 439 | counter = 0; 440 | display_counter = 1; 441 | } 442 | } else { 443 | if ((counter & (counter_dot - 1)) == 0) 444 | printf("."); 445 | if ((counter & (counter_line - 1)) == 0) 446 | printf("[%08x]\n", (unsigned int)(counter)); 447 | } 448 | } 449 | 450 | failure: 451 | printf("fuzz mismatch! dumping input and output data\n\n"); 452 | 453 | printf("INPUT\n\n"); 454 | fuzz_print_input(input_variables, random_sizes, fuzz_input); 455 | 456 | printf("OUTPUT\n\n"); 457 | outp = fuzz_output; 458 | fuzz_print_output(impl_list[0], output_variables, random_sizes, outp, fuzz_output); 459 | outp += expected_bytes_out; 460 | 461 | for (i = 1; i < impl_count; i++) { 462 | fuzz_print_output(impl_list[i], output_variables, random_sizes, outp, fuzz_output); 463 | outp += expected_bytes_out; 464 | } 465 | 466 | done: 467 | if (fuzz_input) 468 | free(fuzz_input); 469 | if (fuzz_output) 470 | free(fuzz_output); 471 | free((void *)impl_list_alloc); 472 | } 473 | -------------------------------------------------------------------------------- /framework/include/bench.h: -------------------------------------------------------------------------------- 1 | #ifndef BENCH_H 2 | #define BENCH_H 3 | 4 | #include "asmopt_internal.h" 5 | #include "cpuid.h" 6 | 7 | typedef void (*impl_bench)(const void *impl); 8 | 9 | /* a 32k, 64 byte aligned buffer to bench with */ 10 | unsigned char *bench_get_buffer(void); 11 | 12 | int 
bench(const void *impls, size_t impl_size, impl_test test_fn, impl_bench bench_fn, size_t units_count, const char *units_desc); 13 | 14 | #endif /* BENCH_H */ 15 | 16 | -------------------------------------------------------------------------------- /framework/include/cpucycles.h: -------------------------------------------------------------------------------- 1 | #ifndef CPUCYCLES_H 2 | #define CPUCYCLES_H 3 | 4 | #include "asmopt_internal.h" 5 | 6 | #if defined(HAVE_INT64) 7 | typedef uint64_t cycles_t; 8 | #elif defined(HAVE_INT32) 9 | typedef uint32_t cycles_t; 10 | #else 11 | typedef unsigned long cycles_t; 12 | #endif 13 | 14 | cycles_t LOCAL_PREFIX(cpucycles)(void); 15 | const char *LOCAL_PREFIX(cpucycles_units)(void); 16 | 17 | #endif /* CPUCYCLES_H */ 18 | 19 | -------------------------------------------------------------------------------- /framework/include/cpuid.h: -------------------------------------------------------------------------------- 1 | #ifndef CPUID_H 2 | #define CPUID_H 3 | 4 | #include "asmopt_internal.h" 5 | 6 | #if defined(__cplusplus) 7 | extern "C" { 8 | #endif 9 | 10 | enum cpuid_flags_generic_t { 11 | CPUID_GENERIC = (0) 12 | }; 13 | 14 | #include "cpuid_flags.inc" 15 | 16 | unsigned long LOCAL_PREFIX(cpuid)(void); 17 | 18 | /* runtime dispatching based on current cpu */ 19 | typedef struct cpu_specific_impl_t { 20 | unsigned long cpu_flags; 21 | const char *desc; 22 | /* additional information, pointers to methods, etc... 
*/ 23 | } cpu_specific_impl_t; 24 | 25 | typedef int (*impl_test)(const void *impl); 26 | 27 | const void *LOCAL_PREFIX(cpu_select)(const void *impls, size_t impl_size, impl_test test_fn); 28 | 29 | #if defined(__cplusplus) 30 | } 31 | #endif 32 | 33 | #endif /* CPUID_H */ 34 | -------------------------------------------------------------------------------- /framework/include/fuzz.h: -------------------------------------------------------------------------------- 1 | #ifndef FUZZ_H 2 | #define FUZZ_H 3 | 4 | #include "asmopt_internal.h" 5 | 6 | #if defined(__cplusplus) 7 | extern "C" { 8 | #endif 9 | 10 | typedef void (*impl_fuzz)(const void *impl, const unsigned char *in, const size_t *random_sizes, unsigned char *out); 11 | 12 | typedef enum { 13 | FUZZ_DONE, 14 | FUZZ_ARRAY, 15 | FUZZ_RANDOM_LENGTH_ARRAY0, 16 | FUZZ_RANDOM_LENGTH_ARRAY1, 17 | FUZZ_RANDOM_LENGTH_ARRAY2, 18 | FUZZ_RANDOM_LENGTH_ARRAY3 19 | } fuzz_type_t; 20 | 21 | typedef struct fuzz_variable_t { 22 | const char *desc; 23 | fuzz_type_t type; 24 | size_t size; 25 | } fuzz_variable_t; 26 | 27 | void fuzz_init(void); 28 | void fuzz_init_deterministic(void); 29 | void fuzz_get_bytes(void *out, size_t len); 30 | void fuzz_print_bytes(const char *desc, const unsigned char *bytes, const unsigned char *base, size_t len); 31 | void fuzz(const void *impls, size_t impl_size, const fuzz_variable_t *input_variables, const fuzz_variable_t *output_variables, impl_fuzz fuzz_fn); 32 | 33 | #if defined(__cplusplus) 34 | } 35 | #endif 36 | 37 | #endif /* FUZZ_H */ 38 | -------------------------------------------------------------------------------- /framework/main_shared.c: -------------------------------------------------------------------------------- 1 | #include "asmopt_internal.h" 2 | 3 | #if defined(_WIN32) || defined(__CYGWIN__) 4 | 5 | #include 6 | 7 | BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) { 8 | hinstDLL; 9 | lpvReserved; 10 | 11 | switch (fdwReason) { 12 | case 
DLL_PROCESS_ATTACH: 13 | break; 14 | 15 | case DLL_THREAD_ATTACH: 16 | break; 17 | 18 | case DLL_THREAD_DETACH: 19 | break; 20 | 21 | case DLL_PROCESS_DETACH: 22 | break; 23 | } 24 | 25 | return TRUE; 26 | } 27 | 28 | #endif /* defined(_WIN32) || defined(__CYGWIN__) */ 29 | -------------------------------------------------------------------------------- /framework/main_util.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* includes, and implementations, implementations_count */ 5 | typedef struct implementation_t { 6 | const char *name; 7 | int (*startup)(void); 8 | void (*fuzz)(void); 9 | void (*bench)(void); 10 | } implementation_t; 11 | 12 | #define make_impl(name) {#name, name##_startup, name##_fuzz, name##_bench} 13 | 14 | #include "util_implementations.h" 15 | 16 | static size_t implementations_count = (sizeof(implementations) / sizeof(implementation_t)); 17 | 18 | static int 19 | help(void) { 20 | if (implementations_count > 1) { 21 | size_t i; 22 | printf("usage: util ["); 23 | for (i = 0; i < implementations_count; i++) { 24 | printf("%s", implementations[i].name); 25 | if (i < (implementations_count - 1)) 26 | printf(","); 27 | } 28 | printf("] [fuzz,bench]\n\n"); 29 | } else { 30 | printf("usage: util [fuzz,bench]\n\n"); 31 | } 32 | return 1; 33 | } 34 | 35 | int main(int argc, const char *argv[]) { 36 | const implementation_t *sel = implementations, *end = sel + implementations_count; 37 | size_t action_arg = 1; 38 | 39 | if (implementations_count == 0) { 40 | printf("no implementations available\n"); 41 | return 1; 42 | } 43 | 44 | if (argc < ((implementations_count > 1) ? 
3 : 2)) 45 | return help(); 46 | 47 | if (implementations_count > 1) { 48 | while (sel < end) { 49 | if (strcmp(argv[1], sel->name) == 0) 50 | break; 51 | sel++; 52 | } 53 | 54 | if (sel == end) 55 | return help(); 56 | 57 | action_arg = 2; 58 | } 59 | 60 | if (sel->startup() != 0) { 61 | printf("%s failed to startup\n", sel->name); 62 | return 1; 63 | } 64 | 65 | if (strcmp(argv[action_arg], "fuzz") == 0) 66 | sel->fuzz(); 67 | else if (strcmp(argv[action_arg], "bench") == 0) 68 | sel->bench(); 69 | else 70 | return help(); 71 | 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /genvs.php: -------------------------------------------------------------------------------- 1 | 1, "asmopt_internal.h"=>1, "util_implementations.h"=>1); 43 | 44 | function crawl(&$list, $dir, $grab, $recurse) { 45 | global $crawl_ignore; 46 | $dh = opendir($dir); 47 | if ($dh) { 48 | while (($file = readdir($dh)) !== false) { 49 | $path = $dir."/".$file; 50 | if (($file == ".") || ($file == "..") || isset($crawl_ignore[$file])) 51 | continue; 52 | if (is_dir($path)) { 53 | if ($recurse) 54 | crawl($list, $path, $grab, $recurse); 55 | } else { 56 | foreach($grab as $pat) { 57 | if (preg_match($pat, $file)) { 58 | $list[] = fixslash($path); 59 | break; 60 | } 61 | } 62 | } 63 | } 64 | closedir($dh); 65 | } 66 | } 67 | 68 | abstract class gen_vs { 69 | protected $name; 70 | protected $builds; 71 | protected $projects; 72 | protected $sln; 73 | protected $project_dir; 74 | protected $files; 75 | protected $include_dirs; 76 | 77 | public function gen_vs($name) { 78 | $this->name = strtolower($name); 79 | $this->projects = array(); 80 | 81 | foreach(array("lib", "dll", "util") as $type) { 82 | $name = "{$this->name}_{$type}"; 83 | $this->projects[$type] = array("name"=>$name, "guid"=>get_guid($name)); 84 | } 85 | 86 | $this->include_dirs = array("./", "../app/include", "../app/extensions", "../framework/include", "../framework/driver", 
"../framework/driver/x86"); 87 | } 88 | 89 | public function build_files() { 90 | $this->files = array("driver"=>array(), "ext"=>array(), "util"=>array(), "shared"=>array(), "include"=>array()); 91 | crawl($this->files["driver"], "framework/driver", array("!\.c$!", "!\.h$!", "!\.inc$!"), false); 92 | crawl($this->files["driver"], "framework/driver/x86", array("!\.c$!", "!\.S$!", "!\.h$!", "!\.inc$!"), false); 93 | crawl($this->files["ext"], "app/extensions", array("!\.c$!", "!\.S$!", "!\.inc$!", "!\.h$!"), true); 94 | crawl($this->files["include"], "app/include", array("!\.h$!"), false); 95 | crawl($this->files["include"], "framework/include", array("!\.h$!"), false); 96 | crawl($this->files["shared"], "framework", array("!main_shared\.c$!"), false); 97 | crawl($this->files["util"], "framework", array("!main_util\.c$!", "!fuzz\.c$!", "!bench\.c$!"), true); 98 | 99 | $this->projects["lib"]["files"] = array("driver", "ext", "include"); 100 | $this->projects["dll"]["files"] = array("driver", "ext", "include", "shared"); 101 | $this->projects["util"]["files"] = array("driver", "ext", "include", "util"); 102 | } 103 | 104 | public function write_file($name, $str) { 105 | $in = array("%%name", "%%NAME", "%%projectdir"); 106 | $out = array($this->name, strtoupper($this->name), $this->project_dir); 107 | $name = str_replace($in, $out, $name); 108 | $str = str_replace($in, $out, $str); 109 | $f = fopen("{$name}", "w+"); 110 | chmod("{$name}", 0755); 111 | fwrite($f, $str); 112 | fclose($f); 113 | } 114 | 115 | public abstract function make(); 116 | }; 117 | 118 | 119 | /* 120 | vs 2010 'tricks' 121 | 122 | allow a files with the same name, but different paths, to be compiled correctly and not in to a flat directory: set 123 | ObjectFileName path to "$(IntDir)dummy\\%(RelativeDir)/", dummy eats the ../ we used to escape the vs2010 dir. 
124 | 125 | 126 | */ 127 | 128 | class vs2010 extends gen_vs { 129 | protected $fileinfo; 130 | 131 | protected $toolset; 132 | protected $toolsversion; 133 | protected $fileformatversion; 134 | protected $vsversion; 135 | 136 | public function vs2010($name) { 137 | parent::gen_vs($name); 138 | 139 | $this->sln = "{$this->name}.sln"; 140 | 141 | foreach($this->projects as $handle=>&$info) 142 | $info["vcxproj"] = "{$info['name']}.vcxproj"; 143 | 144 | $this->builds = array( 145 | "Debug|x86-32bit"=>"Debug|Win32", 146 | "Debug|amd64"=>"Debug|x64", 147 | "Release|x86-32bit"=>"Release|Win32", 148 | "Release|amd64"=>"Release|x64" 149 | ); 150 | 151 | $this->project_dir = "vs2010"; 152 | $this->toolset = "v100"; 153 | $this->fileformatversion = "11.00"; 154 | $this->vsversion = "# Visual Studio 2010"; 155 | $this->toolsversion = "4.0"; 156 | } 157 | 158 | function make_sln() { 159 | $f = fopen("{$this->project_dir}/".$this->sln, "w+"); 160 | fecho($f, 161 | addln("Microsoft Visual Studio Solution File, Format Version {$this->fileformatversion}"). 162 | addln("{$this->vsversion}") 163 | ); 164 | 165 | foreach($this->projects as $handle=>$info) { 166 | fecho($f, 167 | addln("Project(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = ".quote($info["name"]).", ".quote($info["vcxproj"]).", ".quote($info["guid"])). 168 | addln("EndProject") 169 | ); 170 | } 171 | 172 | fecholn($f, "Global"); 173 | fecholn($f, " GlobalSection(SolutionConfigurationPlatforms) = preSolution"); 174 | foreach($this->builds as $label=>$build) 175 | fecholn($f, " {$label} = {$label}"); 176 | fecholn($f, " EndGlobalSection"); 177 | 178 | fecholn($f, " GlobalSection(ProjectConfigurationPlatforms) = postSolution"); 179 | foreach($this->projects as $handle=>$info) { 180 | foreach($this->builds as $label=>$build) { 181 | fecho($f, 182 | addln(" {$info['guid']}.{$label}.ActiveCfg = {$build}"). 
183 | addln(" {$info['guid']}.{$label}.Build.0 = {$build}") 184 | ); 185 | } 186 | } 187 | fecholn($f, " EndGlobalSection"); 188 | 189 | fecho($f, 190 | addln(" GlobalSection(SolutionProperties) = preSolution"). 191 | addln(" HideSolutionNode = FALSE"). 192 | addln(" EndGlobalSection") 193 | ); 194 | fecholn($f, "EndGlobal"); 195 | fclose($f); 196 | } 197 | 198 | public function make_vcxproj_filters() { 199 | foreach($this->projects as $handle=>$info) { 200 | $f = fopen("{$this->project_dir}/".$info["vcxproj"].".filters", "w+"); 201 | 202 | fecholn($f, 203 | "". 204 | "" 205 | ); 206 | 207 | /* list of filters we'll be using */ 208 | fecho($f, 209 | "". 210 | "" 211 | ); 212 | 213 | $seen = array(); 214 | foreach($info["files"] as $handle) { 215 | foreach($this->files[$handle] as $path) { 216 | while (1) { 217 | $chop_directory = preg_replace("!^(.*)\\\\.*$!", "$1", $path); 218 | if ($chop_directory === $path) 219 | break; 220 | $seen[$chop_directory] = 1; 221 | $path = $chop_directory; 222 | } 223 | } 224 | } 225 | 226 | foreach($seen as $basepath=>$dummy) 227 | fecho($f, ""); 228 | fecholn($f, ""); 229 | /* list of filters we'll be using */ 230 | 231 | /* list of files with their filters */ 232 | foreach($info["files"] as $handle) { 233 | fecho($f, ""); 234 | foreach($this->files[$handle] as $path) { 235 | $type = $this->fileinfo[$path]["type"]; 236 | $folder = $this->fileinfo[$path]["basepath"]; 237 | fecho($f, "<{$type} Include=\"..\\{$path}\">Source\\{$folder}"); 238 | } 239 | fecholn($f, ""); 240 | } 241 | /* list of files with their filters */ 242 | 243 | fecholn($f, ""); 244 | 245 | fclose($f); 246 | } 247 | } 248 | 249 | public function make_vcxproj() { 250 | foreach($this->projects as $handle=>$info) { 251 | $f = fopen("{$this->project_dir}/".$info["vcxproj"], "w+"); 252 | 253 | fecholn($f, 254 | "". 
255 | "" 256 | ); 257 | 258 | /* build configurations */ 259 | fecholn($f, ""); 260 | foreach($this->builds as $build) { 261 | $fields = explode("|", $build); 262 | fecholn($f, 263 | "". 264 | "{$fields[0]}". 265 | "{$fields[1]}". 266 | "" 267 | ); 268 | } 269 | fecholn($f, ""); 270 | /* build configurations */ 271 | 272 | 273 | /* properties for this project */ 274 | fecholn($f, 275 | "". 276 | "{$info['guid']}". 277 | "Win32Proj". 278 | "{$this->name}". 279 | "{$this->toolset}". 280 | "" 281 | ); 282 | 283 | /* some project configuration options */ 284 | fecholn($f, ""); 285 | foreach($this->builds as $build) { 286 | $fields = explode("|", $build); 287 | $configurationmap = array("lib"=>"StaticLibrary", "dll"=>"DynamicLibrary", "util"=>"Application"); 288 | $debuglibmap = array("Release"=>"false", "Debug"=>"true"); 289 | fecholn($f, 290 | "". 291 | "{$configurationmap[$handle]}". 292 | "MultiByte". 293 | "{$debuglibmap[$fields[0]]}". 294 | "" 295 | ); 296 | } 297 | /* some project configuration options */ 298 | 299 | fecholn($f, ""); 300 | 301 | fecholn($f, 302 | "". 303 | "". 304 | "" 305 | ); 306 | 307 | fecholn($f, ""); 308 | 309 | /* target and directories */ 310 | foreach($this->builds as $label=>$build) { 311 | $fields = explode("|", $label); 312 | $target_name = $this->name; 313 | $target_ext = ($handle == "util") ? "exe" : $handle; 314 | fecholn($f, 315 | "". 316 | "$(SolutionDir)..\\bin\\{$fields[0]}\\{$fields[1]}\\". 317 | "$(SolutionDir)..\\build\\{$handle}\\{$fields[0]}\\{$fields[1]}\\". 318 | "{$target_name}". 319 | ".{$target_ext}". 
320 | "" 321 | ); 322 | } 323 | /* target and directories */ 324 | 325 | 326 | /* compiler and linker */ 327 | $settingsmap = array( 328 | "Optimization"=>array("Release"=>"MaxSpeed", "Debug"=>"Disabled"), 329 | "IntrinsicFunctions"=>array("Release"=>"true", "Debug"=>"false"), 330 | "InlineFunctionExpansion"=>array("Release"=>"AnySuitable", "Debug"=>"Disabled"), 331 | "FavorSizeOrSpeed"=>array("Release"=>"Speed", "Debug"=>"Neither"), 332 | "BufferSecurityCheck"=>array("Release"=>"false", "Debug"=>"true"), 333 | "EnableCOMDATFolding"=>array("Release"=>"true", "Debug"=>"false"), 334 | "OptimizeReferences"=>array("Release"=>"true", "Debug"=>"false"), 335 | "SubSystem"=>array("lib"=>"Windows", "dll"=>"Windows", "util"=>"Console"), 336 | "PreprocessorDefinitions"=>array("lib"=>"", "dll"=>"BUILDING_DLL;LIB_PUBLIC=__declspec(dllexport)", "util"=>"UTILITIES"), 337 | ); 338 | 339 | $includes = ""; 340 | foreach($this->include_dirs as $dir) 341 | $includes .= str_replace("/", "\\", $dir).";"; 342 | 343 | foreach($this->builds as $build) { 344 | $fields = explode("|", $build); 345 | fecholn($f, ""); 346 | /* compiler */ 347 | fecholn($f, 348 | "". 349 | /* static options */ 350 | "". 351 | "Level4". 352 | "false". 353 | "{$includes}". 354 | "$(IntDir)dummy\\%(RelativeDir)/". 355 | /* custom options */ 356 | "{$settingsmap['BufferSecurityCheck'][$fields[0]]}". 357 | "{$settingsmap['Optimization'][$fields[0]]}". 358 | "{$settingsmap['IntrinsicFunctions'][$fields[0]]}". 359 | "{$settingsmap['InlineFunctionExpansion'][$fields[0]]}". 360 | "{$settingsmap['FavorSizeOrSpeed'][$fields[0]]}". 361 | "{$settingsmap['BufferSecurityCheck'][$fields[0]]}". 362 | "{$settingsmap['PreprocessorDefinitions'][$handle]};%(PreprocessorDefinitions)". 363 | "" 364 | ); 365 | /* linker */ 366 | 367 | switch ($handle) { 368 | case "lib": 369 | fecholn($f, 370 | "". 371 | "false". 372 | "" 373 | ); 374 | break; 375 | 376 | case "dll": 377 | case "util": 378 | fecholn($f, 379 | "". 380 | "true". 
381 | "{$settingsmap['SubSystem'][$handle]}". 382 | "{$settingsmap['EnableCOMDATFolding'][$fields[0]]}". 383 | "{$settingsmap['OptimizeReferences'][$fields[0]]}". 384 | "$(OutDir){$this->name}.dll.lib". 385 | "$(TargetDir)$(TargetName)$(TargetExt).pdb". 386 | "" 387 | ); 388 | break; 389 | } 390 | fecholn($f, ""); 391 | } 392 | fecholn($f, ""); 393 | /* compiler and linker */ 394 | 395 | /* list of files */ 396 | $yasm_includes = ""; 397 | foreach($this->include_dirs as $dir) 398 | $yasm_includes .= "-I{$dir} "; 399 | 400 | foreach($info["files"] as $handle) { 401 | fecholn($f, ""); 402 | foreach($this->files[$handle] as $path) { 403 | $type = $this->fileinfo[$path]["type"]; 404 | $folder = $this->fileinfo[$path]["basepath"]; 405 | $cleanpath = str_replace("../", "", $path); 406 | $basename = preg_replace("!(.*)\..*$!", "$1", $this->fileinfo[$path]["basename"]); 407 | if ($type == "CustomBuild") { 408 | fecholn($f, 409 | "<{$type} Include=\"..\\{$path}\">". 410 | "yasm [{$cleanpath}]". 411 | "yasm -r nasm -p gas {$yasm_includes} -o $(IntDir)\\{$folder}\\{$basename}.obj -f win32 ..\\{$path}". 412 | "yasm -r nasm -p gas {$yasm_includes} -o $(IntDir)\\{$folder}\\{$basename}.obj -f win64 ..\\{$path}". 413 | "$(IntDir)\\{$folder}\\{$basename}.obj". 
414 | "" 415 | ); 416 | } else { 417 | fecholn($f, "<{$type} Include=\"..\\{$path}\">"); 418 | } 419 | } 420 | fecholn($f, ""); 421 | } 422 | /* list of files */ 423 | 424 | fecholn($f, ""); 425 | 426 | fclose($f); 427 | } 428 | } 429 | 430 | public function make_project() { 431 | $this->build_files(); 432 | 433 | $this->fileinfo = array(); 434 | foreach($this->files as $handle=>$list) { 435 | foreach($list as $path) { 436 | $basepath = preg_replace("!^(.*)\\\\.*$!", "$1", $path); 437 | $basename = preg_replace("!^.*\\\\(.*)$!", "$1", $path); 438 | $this->fileinfo[$path]["basepath"] = $basepath; 439 | $this->fileinfo[$path]["basename"] = $basename; 440 | 441 | $ext = preg_replace("!^.*\.(.*)$!", "$1", $path); 442 | switch ($ext) { 443 | case "c": $type = "ClCompile"; break; 444 | case "S": $type = "CustomBuild"; break; 445 | case "inc": $type = "ClHeader"; break; 446 | case "h": $type = "ClHeader"; break; 447 | } 448 | $this->fileinfo[$path]["type"] = $type; 449 | } 450 | } 451 | 452 | $this->make_vcxproj(); 453 | $this->make_vcxproj_filters(); 454 | } 455 | 456 | public function make() { 457 | if (!file_exists($this->project_dir)) 458 | mkdir($this->project_dir, 0755); 459 | 460 | $this->make_sln(); 461 | $this->make_project(); 462 | } 463 | } 464 | 465 | class vs2012 extends vs2010 { 466 | public function vs2012($name) { 467 | parent::vs2010($name); 468 | 469 | $this->project_dir = "vs2012"; 470 | $this->toolset = "v110"; 471 | $this->fileformatversion = "12.00"; 472 | $this->vsversion = "# Visual Studio 2012"; 473 | } 474 | } 475 | 476 | class vs2013 extends vs2012 { 477 | public function vs2013($name) { 478 | parent::vs2012($name); 479 | 480 | $this->project_dir = "vs2013"; 481 | $this->toolset = "v120"; 482 | $this->fileformatversion = "12.00"; 483 | $this->vsversion = "# Visual Studio 2013"; 484 | $this->toolsversion = "12.0"; 485 | } 486 | } 487 | 488 | 489 | class argument { 490 | var $set, $value; 491 | } 492 | 493 | 494 | class anyargument extends 
argument { 495 | function anyargument($flag) { 496 | global $argc, $argv; 497 | 498 | $this->set = false; 499 | 500 | for ($i = 1; $i < $argc; $i++) { 501 | if (!preg_match("!--".$flag."=(.*)!", $argv[$i], $m)) 502 | continue; 503 | $this->value = $m[1]; 504 | $this->set = true; 505 | return; 506 | } 507 | } 508 | } 509 | 510 | /* prefix an argument with a * to indicate default */ 511 | class multiargument extends anyargument { 512 | function multiargument($flag, $legal_values) { 513 | parent::anyargument($flag); 514 | 515 | $map = array(); 516 | $default = false; 517 | foreach($legal_values as $value) { 518 | if (substr($value, 0, 1) == "*") 519 | $default = substr($value, 1); 520 | $map[$value] = true; 521 | } 522 | 523 | if (!$this->set) { 524 | if ($default === false) { 525 | usage("value not specified for --{$flag}!"); 526 | exit(1); 527 | } 528 | $this->value = $default; 529 | return; 530 | } 531 | 532 | if (!isset($map[$this->value])) { 533 | usage("{$this->value} is not a valid parameter to --{$flag}!"); 534 | exit(1); 535 | } 536 | } 537 | } 538 | 539 | 540 | class flag extends argument { 541 | function flag($flag) { 542 | global $argc, $argv; 543 | 544 | $this->set = false; 545 | 546 | $flag = "--{$flag}"; 547 | for ($i = 1; $i < $argc; $i++) { 548 | if ($argv[$i] !== $flag) 549 | continue; 550 | $this->value = true; 551 | $this->set = true; 552 | return; 553 | } 554 | } 555 | } 556 | 557 | function usage($reason = "") { 558 | echoln("Usage: php genvs.php [flags]"); 559 | echoln("Flags in parantheses are optional"); 560 | echoln(""); 561 | echoln(" --version=[vs2013,vs2012,vs2010] which project type to generate"); 562 | echoln(" (--disable-yasm) do not use yasm"); 563 | echoln(""); 564 | if ($reason) 565 | echoln($reason); 566 | } 567 | 568 | $help = new flag("help"); 569 | $disable_yasm = new flag("disable-yasm"); 570 | $version = new multiargument("version", array("vs2010", "vs2012", "vs2013")); 571 | 572 | 573 | if ($help->set) { 574 | usage(); 575 | 
exit(0); 576 | } 577 | 578 | $project_name = trim(my_file_get_contents("app/project.def")); 579 | 580 | switch ($version->value) { 581 | case "vs2010": $sln = new vs2010($project_name); break; 582 | case "vs2012": $sln = new vs2012($project_name); break; 583 | case "vs2013": $sln = new vs2013($project_name); break; 584 | } 585 | 586 | $sln->make(); 587 | 588 | 589 | /* build framework/include/asmopt.h and framework/include/asmopt_internal.h */ 590 | 591 | if ($disable_yasm->set) { 592 | $yasm = ""; 593 | } else { 594 | $yasm = << 616 | 617 | {$yasm} 618 | 619 | #if (defined(_M_IX86)) 620 | #define CPU_32BITS 621 | #elif (defined(_M_X64)) 622 | #define CPU_64BITS 623 | #else 624 | #error This should never happen 625 | #endif 626 | 627 | #define HAVE_INT64 628 | #define HAVE_INT32 629 | #define HAVE_INT16 630 | #define HAVE_INT8 631 | 632 | #if (_MSC_VER < 1300) 633 | typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; 634 | typedef signed int int32_t; typedef unsigned int uint32_t; 635 | typedef signed short int16_t; typedef unsigned short uint16_t; 636 | typedef signed char int8_t; typedef unsigned char uint8_t; 637 | #elif (_MSC_VER < 1600) 638 | typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; 639 | typedef signed __int32 int32_t; typedef unsigned __int32 uint32_t; 640 | typedef signed __int16 int16_t; typedef unsigned __int16 uint16_t; 641 | typedef signed __int8 int8_t; typedef unsigned __int8 uint8_t; 642 | #else 643 | #include 644 | #endif 645 | 646 | #endif /* ASMOPT_H */ 647 | 648 | 649 | EOS; 650 | 651 | 652 | $asmopt_internal = <<write_file("%%projectdir/asmopt.h", $asmopt_h); 673 | $sln->write_file("%%projectdir/asmopt_internal.h", $asmopt_internal); 674 | 675 | 676 | 677 | /* build framework/include/util_implemntations.h */ 678 | 679 | $impls = array(); 680 | crawl($impls, "app/include", array("!\.h$!"), false); 681 | 682 | $impl_includes = ""; 683 | $impl_declares = ""; 684 | for ($i = 0; $i < count($impls); $i++) { 685 
| $path = $impls[$i]; 686 | $basename = preg_replace("!^.*\\\\(.*)\.h$!", "$1", $path); 687 | $impl_includes .= addln("#include \"{$basename}.h\""); 688 | $impl_declares .= ($i < (count($impls) - 1)) ? addln("\tmake_impl({$basename}),") : "\tmake_impl({$basename})"; 689 | } 690 | 691 | $util_implementations = <<write_file("%%projectdir/util_implementations.h", $util_implementations); 701 | 702 | ?> -------------------------------------------------------------------------------- /sources/crypto_onetimeauth_poly1305_ref_auth.c: -------------------------------------------------------------------------------- 1 | /* 2 | 20080912 3 | D. J. Bernstein 4 | Public domain. 5 | */ 6 | 7 | #include "crypto_onetimeauth.h" 8 | 9 | static void add(unsigned int h[17],const unsigned int c[17]) 10 | { 11 | unsigned int j; 12 | unsigned int u; 13 | u = 0; 14 | for (j = 0;j < 17;++j) { u += h[j] + c[j]; h[j] = u & 255; u >>= 8; } 15 | } 16 | 17 | static void squeeze(unsigned int h[17]) 18 | { 19 | unsigned int j; 20 | unsigned int u; 21 | u = 0; 22 | for (j = 0;j < 16;++j) { u += h[j]; h[j] = u & 255; u >>= 8; } 23 | u += h[16]; h[16] = u & 3; 24 | u = 5 * (u >> 2); 25 | for (j = 0;j < 16;++j) { u += h[j]; h[j] = u & 255; u >>= 8; } 26 | u += h[16]; h[16] = u; 27 | } 28 | 29 | static const unsigned int minusp[17] = { 30 | 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 31 | } ; 32 | 33 | static void freeze(unsigned int h[17]) 34 | { 35 | unsigned int horig[17]; 36 | unsigned int j; 37 | unsigned int negative; 38 | for (j = 0;j < 17;++j) horig[j] = h[j]; 39 | add(h,minusp); 40 | negative = -(h[16] >> 7); 41 | for (j = 0;j < 17;++j) h[j] ^= negative & (horig[j] ^ h[j]); 42 | } 43 | 44 | static void mulmod(unsigned int h[17],const unsigned int r[17]) 45 | { 46 | unsigned int hr[17]; 47 | unsigned int i; 48 | unsigned int j; 49 | unsigned int u; 50 | 51 | for (i = 0;i < 17;++i) { 52 | u = 0; 53 | for (j = 0;j <= i;++j) u += h[j] * r[i - j]; 54 | for (j = i + 1;j < 17;++j) u += 320 * 
h[j] * r[i + 17 - j]; 55 | hr[i] = u; 56 | } 57 | for (i = 0;i < 17;++i) h[i] = hr[i]; 58 | squeeze(h); 59 | } 60 | 61 | int crypto_onetimeauth(unsigned char *out,const unsigned char *in,unsigned long long inlen,const unsigned char *k) 62 | { 63 | unsigned int j; 64 | unsigned int r[17]; 65 | unsigned int h[17]; 66 | unsigned int c[17]; 67 | 68 | r[0] = k[0]; 69 | r[1] = k[1]; 70 | r[2] = k[2]; 71 | r[3] = k[3] & 15; 72 | r[4] = k[4] & 252; 73 | r[5] = k[5]; 74 | r[6] = k[6]; 75 | r[7] = k[7] & 15; 76 | r[8] = k[8] & 252; 77 | r[9] = k[9]; 78 | r[10] = k[10]; 79 | r[11] = k[11] & 15; 80 | r[12] = k[12] & 252; 81 | r[13] = k[13]; 82 | r[14] = k[14]; 83 | r[15] = k[15] & 15; 84 | r[16] = 0; 85 | 86 | for (j = 0;j < 17;++j) h[j] = 0; 87 | 88 | while (inlen > 0) { 89 | for (j = 0;j < 17;++j) c[j] = 0; 90 | for (j = 0;(j < 16) && (j < inlen);++j) c[j] = in[j]; 91 | c[j] = 1; 92 | in += j; inlen -= j; 93 | add(h,c); 94 | mulmod(h,r); 95 | } 96 | 97 | freeze(h); 98 | 99 | for (j = 0;j < 16;++j) c[j] = k[j + 16]; 100 | c[16] = 0; 101 | add(h,c); 102 | for (j = 0;j < 16;++j) out[j] = h[j]; 103 | return 0; 104 | } 105 | --------------------------------------------------------------------------------