├── Makefile
├── README
├── README.md
├── arm_asm.S
├── arm_asm.h
├── benchmark.c
├── memcpy-hybrid.S
├── memcpy-hybrid.h
├── new_arm.S
└── new_arm.h


/Makefile:
--------------------------------------------------------------------------------
 1 | # PLATFORM must one of the following list and is used to select the memcpy/
 2 | # memset variants used in the replacement library (libfastarm.so)
 3 | # - RPI selects optimizations for the armv6-based Raspberry Pi with a
 4 | #   preload offset of 96 bytes.
 5 | # - ARMV7_32 selects a cache line size of 32, suitable for most Cortex
 6 | #   platforms. The used preload offset is 192 bytes.
 7 | # - ARMV7_64 selects a cache line size of 64 bytes, suitable for potential
 8 | #   Cortex platforms in which all cache line fills (including from DRAM) are
 9 | #   64 bytes. The used preload offset is 192 bytes.
10 | # - NEON_32 selects NEON optimizations with a cache line of 32 bytes.
11 | #   The used preload offset is 192 bytes.
12 | # - NEON_64 selects NEON optimizations with a cahce line size of 64 bytes.
13 | #   The used preload offset is 192 bytes.
14 | # - NEON_AUTO selects NEON optimizations for Cortex cores with a suitably
15 | #   advanced automatic prefetcher that most preload instructions are unnecessary.
16 | #   Only early preloads are generated.
17 | # Uncomment the THUMBFLAGS definition to compile in ARM mode as opposed to Thumb2
18 | 
19 | PLATFORM = NEON_32
20 | THUMBFLAGS = -march=armv7-a -Wa,-march=armv7-a -mthumb -Wa,-mthumb \
21 |  -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB
22 | BENCHMARK_CONFIG_FLAGS = -DINCLUDE_MEMCPY_HYBRID # -DINCLUDE_LIBARMMEM_MEMCPY
23 | #LIBARMMEM = -larmmem
24 | CORTEX_STRINGS_MEMCPY_HYBRID = memcpy-hybrid.o
25 | CFLAGS = -std=gnu99 -Ofast -Wall $(BENCHMARK_CONFIG_FLAGS)
26 | PCFLAGS = -std=gnu99 -O -Wall $(BENCHMARK_CONFIG_FLAGS) -pg -ggdb
27 | 
28 | all : benchmark libfastarm.so
29 | 
30 | benchmark : benchmark.o arm_asm.o new_arm.o $(CORTEX_STRINGS_MEMCPY_HYBRID)
31 | 	$(CC) $(CFLAGS) benchmark.o arm_asm.o new_arm.o \
32 | 	$(CORTEX_STRINGS_MEMCPY_HYBRID) -o benchmark -lm -lrt $(LIBARMMEM)
33 | 
34 | benchmarkp : benchmark.c arm_asm.S
35 | 	$(CC) $(PCFLAGS) benchmark.c arm_asm.S new_arm.S -o benchmarkp -lc -lm -lrt $(LIBARMMEM)
36 | 
37 | install_memcpy_replacement : libfastarm.so
38 | 	install -m 0755 libfastarm.so /usr/lib/arm-linux-gnueabihf/libfastarm.so
39 | 	@echo 'To enable the use of the enhanced memcpy by applications, edit or'
40 | 	@echo 'create the file /etc/ld.so.preload so that it contains the line:'
41 | 	@echo '/usr/lib/arm-linux-gnueabihf/libfastarm.so'
42 | 	@echo 'On the RPi platform, references to libcofi_rpi.so should be commented'
43 | 	@echo 'out or deleted.'
44 | 
45 | libfastarm.so : memcpy_replacement.o
46 | 	$(CC) -o libfastarm.so -shared memcpy_replacement.o
47 | 
48 | memcpy_replacement.o : new_arm.S
49 | 	$(CC) -c -s -x assembler-with-cpp $(THUMBFLAGS) \
50 | -DMEMCPY_REPLACEMENT_$(PLATFORM) -DMEMSET_REPLACEMENT_$(PLATFORM) \
51 | -o memcpy_replacement.o new_arm.S
52 | 
53 | clean :
54 | 	rm -f benchmark
55 | 	rm -f benchmark.o
56 | 	rm -f benchmarkp
57 | 	rm -f arm_asm.s
58 | 	rm -f arm_asm.o
59 | 	rm -f new_arm.o
60 | 	rm -f memcpy_replacement.o
61 | 	rm -f libfastarm.so
62 | 
63 | benchmark.o : benchmark.c arm_asm.h
64 | 
65 | arm_asm.o : arm_asm.S arm_asm.h
66 | 
67 | new_arm.o : new_arm.S new_arm.h
68 | 
69 | memcpy-hybrid.o : memcpy-hybrid.S
70 | 
71 | .c.o : 
72 | 	$(CC) -c $(CFLAGS) $< -o $@
73 | 
74 | .S.o :
75 | 	$(CC) -c -s $(CFLAGS) $(THUMBFLAGS) $< -o $@
76 | 
77 | .c.s :
78 | 	$(CC) -S $(CFLAGS) $< -o $@
79 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | fastarm
 2 | 
 3 | This toolkit contains a set of fast memcpy/memset variants for ARM
 4 | platforms. They either use the standard register file, or optionally
 5 | NEON instructions,
 6 | 
 7 | Several basic families of variants are provided; the current ones are
 8 | the "new memcpy" variants which are the default for memcpy replacement,
 9 | which generally do not overfetch beyond the source region and can be
10 | configured to use unaligned memory access for small sizes, or to use
11 | strictly aligned memory access. This family can also be configured to
12 | include a fast path for smaller sizes (this is the default), disabling
13 | this results in smaller code size at the expense of worse performance
14 | for small sizes. NEON optimized versions, which are generally faster
15 | with reduced code size, are also provided.
16 | 
17 | To compile the benchmark program, run 'make'. This will compile in a
18 | plethora of variants with different preload strategies, block sizes,
19 | alignment etc.
20 | 
21 | A benchmark program to compare various memcpy variants is provided. Try
22 | something like "./benchmark --memcpy ad --all". (Use --memcpy al on the
23 | Raspberry Pi platform).
24 | 
25 | To compile a memcpy replacement library, set PLATFORM to one of the
26 | values described at the beginning of the Makefile. This selects the
27 | cache line size to use and whether to use NEON versions.
28 | 
29 | Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS
30 | definition. It must be disabled on the Raspberry Pi.
31 | 
32 | Then run:
33 | 
34 |     sudo make install_memcpy_replacement
35 | 
36 | The replacement memcpy/memset shared library will be installed into
37 | /usr/lib/arm-linux-gnueabihf/ as libfastarm.so.
38 | 
39 | To enable the use of the replacement memcpy in applications, create or edit
40 | the file /etc/ld.so.preload so that it contains the line:
41 | 
42 |     /usr/lib/arm-linux-gnueabihf/libfastarm.so
43 | 
44 | On the RPi platform, references to libcofi_rpi.so should be commented out
45 | or deleted. The new memcpy should now be activated for newly launched
46 | programs. To be sure, reboot or run:
47 | 
48 |     sudo ldconfig
49 | 
50 | To revert to the default optimized memcpy on the RPi platform,
51 | edit /etc/ld.so.preload so that it contains the line:
52 | 
53 |     /usr/lib/arm-linux-gnueabihf/libcofi_rpi.so
54 | 
55 | instead of the one using libfastarm.so.
56 | 
57 | Note on cache line size:
58 | 
59 | Although assuming a preload line size 64 bytes is a little faster on several
60 | Cortex platforms for small to moderate sizes, when accessing DRAM
61 | with larger sizes assuming 32 byte preloads seems to be faster. On earlier
62 | Cortex A9 models, 32 byte preloads are required for good performance in all
63 | cases.
64 | 
65 | Notes on performance with and without NEON:
66 | 
67 | For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8
68 | platform for unaligned copies in cache memory and for aligned and unaligned
69 | copies in DRAM. Performance for aligned copies in cache memory is relatively
70 | similar to the optimized non-NEON function.
71 | 
72 | Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of
73 | standard libc (Debian unstable), armv7 and NEON optimized memcpy
74 | variants with line size of 32 bytes:
75 | 
76 | 		libc	armv7	NEON
77 | test 0		522	549	567
78 | test 1		329	377	378
79 | test 2		434	430	513
80 | test 28		351	361	458
81 | test 29		246	248	358
82 | test 43		467	512	581
83 | 
84 | Test 0 in the benchmark program tests word-aligned requests with
85 | sizes that are a power of 2 up to 4096 bytes distributed according
86 | to a power law.
87 | Test 1 in the benchmark program tests word-aligned requests with
88 | sizes up to 1024 that are a multiple of 4, distributed according
89 | to a power law.
90 | Test 2 in the benchmark program tests unaligned requests with sizes
91 | up to 1023 bytes.
92 | Test 28 in the benchmark program tests word aligned requests in DRAM
93 | with sizes up to 1024 bytes.
94 | Test 29 in the benchmark program tests word aligned requests in DRAM
95 | with sizes up to 256 bytes.
96 | Test 43 in the benchmark program tests page aligned requests in DRAM
97 | of size 4096 (copying a memory page).
98 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | fastarm
2 | =======
3 | 
4 | Experimental memcpy speed toolkit for ARM CPUs. Provides optimized replacement
5 | memcpy and memset functions for armv6/armv7 platforms without NEON and NEON-
6 | optimized versions for armv7 platforms with NEON.
7 | 
8 | 


--------------------------------------------------------------------------------
/arm_asm.S:
--------------------------------------------------------------------------------
   1 | /*
   2 |  * Copyright © 2006-2008, 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
   3 |  *
   4 |  * Permission is hereby granted, free of charge, to any person obtaining a
   5 |  * copy of this software and associated documentation files (the "Software"),
   6 |  * to deal in the Software without restriction, including without limitation
   7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 |  * and/or sell copies of the Software, and to permit persons to whom the
   9 |  * Software is furnished to do so, subject to the following conditions:
  10 |  *
  11 |  * The above copyright notice and this permission notice (including the next
  12 |  * paragraph) shall be included in all copies or substantial portions of the
  13 |  * Software.
  14 |  *
  15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21 |  * DEALINGS IN THE SOFTWARE.
  22 |  *
  23 |  * Copyright 2013 Harm Hanemaaijer <fgenfb@yahoo.com>
  24 |  *
  25 |  * 1. Add ".type <function_name>, function" to function definition macro, which
  26 |  *    was required for correct linkage on my platform.
  27 |  * 2. Add non-overfetching memcpy version with a plethora of optimizations and variants using
  28 |  *    macros.
  29 |  *    To do: -- More complete implementation of write_align == 64 for unaligned case.
  30 |  *
  31 |  * On the RPi platform, a good choice is armv5te_no_overfetch_align_16_block_write_16_preload_early_128,
  32 |  * closely followed by armv5te_no_overfetch_align_16_block_write_16_preload_early_96. For
  33 |  * CPU-cache based work loads armv5te_no_overfetch_align_16_block_write_16_preload_96 might be
  34 |  * a little faster.
  35 |  *
  36 |  * On the Allwinner A10 platform, with the reworked version a variant with cache line size of 64,
  37 |  * memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192, seems to be the
  38 |  * best performer.
  39 |  *
  40 |  * On the Allwinner platform, the optimized memcpy is faster; on the RPi libcofi does relatively well
  41 |  * and the optimal memcpy depends on the application.
  42 |  */
  43 | 
  44 | /* Prevent the stack from becoming executable */
  45 | #if defined(__linux__) && defined(__ELF__)
  46 | .section .note.GNU-stack,"",%progbits
  47 | #endif
  48 | 
  49 | #ifdef __arm__
  50 | 
  51 | .text
  52 | .syntax unified
  53 | .fpu neon
  54 | .arch armv7a
  55 | .object_arch armv4
  56 | .arm
  57 | .altmacro
  58 | .p2align 2
  59 | 
  60 | /******************************************************************************/
  61 | 
  62 | .macro asm_function function_name
  63 |     .global \function_name
  64 | .func \function_name
  65 | .type \function_name, function
  66 | .p2align 5
  67 | \function_name:
  68 | .endm
  69 | 
  70 | /******************************************************************************/
  71 | 
  72 | #if !defined(MEMCPY_REPLACEMENT_SUNXI) && !defined(MEMCPY_REPLACEMENT_RPI)
  73 | 
  74 | /*
  75 |  * Helper macro for memcpy function, it can copy data from source (r1) to 
  76 |  * destination (r0) buffers fixing alignment in the process. Destination
  77 |  * buffer should be aligned already (4 bytes alignment is required.
  78 |  * Size of the block to copy is in r2 register
  79 |  */
  80 | .macro  UNALIGNED_MEMCPY shift
  81 |     sub     r1, #(\shift)
  82 |     ldr     ip, [r1], #4
  83 | 
  84 |     tst     r0, #4
  85 |     movne   r3, ip, lsr #(\shift * 8)
  86 |     ldrne   ip, [r1], #4
  87 |     subne   r2, r2, #4
  88 |     orrne   r3, r3, ip, asl #(32 - \shift * 8)
  89 |     strne   r3, [r0], #4
  90 | 
  91 |     tst     r0, #8
  92 |     movne   r3, ip, lsr #(\shift * 8)
  93 |     ldmiane r1!, {r4, ip}
  94 |     subne   r2, r2, #8
  95 |     orrne   r3, r3, r4, asl #(32 - \shift * 8)
  96 |     movne   r4, r4, lsr #(\shift * 8)
  97 |     orrne   r4, r4, ip, asl #(32 - \shift * 8)
  98 |     stmiane r0!, {r3-r4}
  99 |     cmp     r2, #32
 100 |     blt     3f
 101 |     pld     [r1, #48]
 102 |     stmfd   sp!, {r7, r8, r9, r10, r11}
 103 |     add     r3, r1, #128
 104 |     bic     r3, r3, #31
 105 |     sub     r9, r3, r1
 106 | 1:
 107 |     pld     [r1, r9]
 108 |     subs    r2, r2, #32
 109 |     movge   r3, ip, lsr #(\shift * 8)
 110 |     ldmiage r1!, {r4-r6, r7, r8, r10, r11, ip}
 111 |     orrge   r3, r3, r4, asl #(32 - \shift * 8)
 112 |     movge   r4, r4, lsr #(\shift * 8)
 113 |     orrge   r4, r4, r5, asl #(32 - \shift * 8)
 114 |     movge   r5, r5, lsr #(\shift * 8)
 115 |     orrge   r5, r5, r6, asl #(32 - \shift * 8)
 116 |     movge   r6, r6, lsr #(\shift * 8)
 117 |     orrge   r6, r6, r7, asl #(32 - \shift * 8)
 118 |     stmiage r0!, {r3-r6}
 119 |     movge   r7, r7, lsr #(\shift * 8)
 120 |     orrge   r7, r7, r8, asl #(32 - \shift * 8)
 121 |     movge   r8, r8, lsr #(\shift * 8)
 122 |     orrge   r8, r8, r10, asl #(32 - \shift * 8)
 123 |     movge   r10, r10, lsr #(\shift * 8)
 124 |     orrge   r10, r10, r11, asl #(32 - \shift * 8)
 125 |     movge   r11, r11, lsr #(\shift * 8)
 126 |     orrge   r11, r11, ip, asl #(32 - \shift * 8)
 127 |     stmiage r0!, {r7, r8, r10, r11}
 128 |     bgt     1b
 129 | 2:
 130 |     ldmfd   sp!, {r7, r8, r9, r10, r11}
 131 | 3:  /* copy remaining data */
 132 |     tst     r2, #16
 133 |     movne   r3, ip, lsr #(\shift * 8)
 134 |     ldmiane r1!, {r4-r6, ip}
 135 |     orrne   r3, r3, r4, asl #(32 - \shift * 8)
 136 |     movne   r4, r4, lsr #(\shift * 8)
 137 |     orrne   r4, r4, r5, asl #(32 - \shift * 8)
 138 |     movge   r5, r5, lsr #(\shift * 8)
 139 |     orrge   r5, r5, r6, asl #(32 - \shift * 8)
 140 |     movge   r6, r6, lsr #(\shift * 8)
 141 |     orrge   r6, r6, ip, asl #(32 - \shift * 8)
 142 |     stmiane r0!, {r3-r6}
 143 | 
 144 |     tst     r2, #8
 145 |     movne   r3, ip, lsr #(\shift * 8)
 146 |     ldmiane r1!, {r4, ip}
 147 |     orrne   r3, r3, r4, asl #(32 - \shift * 8)
 148 |     movne   r4, r4, lsr #(\shift * 8)
 149 |     orrne   r4, r4, ip, asl #(32 - \shift * 8)
 150 |     stmiane r0!, {r3-r4}
 151 | 
 152 |     tst     r2, #4
 153 |     movne   r3, ip, lsr #(\shift * 8)
 154 |     ldrne   ip, [r1], #4
 155 |     sub     r1, r1, #(4 - \shift)
 156 |     orrne   r3, r3, ip, asl #(32 - \shift * 8)
 157 |     strne   r3, [r0], #4
 158 | 
 159 |     tst     r2, #2
 160 |     ldrbne  r3, [r1], #1
 161 |     ldrbne  r4, [r1], #1
 162 |     ldr     r5, [sp], #4
 163 |     strbne  r3, [r0], #1
 164 |     strbne  r4, [r0], #1
 165 | 
 166 |     tst     r2, #1
 167 |     ldrbne  r3, [r1], #1
 168 |     ldr     r6, [sp], #4
 169 |     strbne  r3, [r0], #1
 170 | 
 171 |     pop     {r0, r4}
 172 | 
 173 |     bx      lr
 174 | .endm
 175 | 
 176 | /*
 177 |  * Memcpy function with Raspberry Pi specific aligned prefetch, based on
 178 |  * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S
 179 |  */
 180 | asm_function memcpy_armv5te
 181 |     cmp     r2, #20
 182 |     blt     9f
 183 |     /* copy data until destination address is 4 bytes aligned */
 184 |     tst     r0, #1
 185 |     ldrbne  r3, [r1], #1
 186 |     stmfd   sp!, {r0, r4}
 187 |     subne   r2, r2, #1
 188 |     strbne  r3, [r0], #1
 189 |     tst     r0, #2
 190 |     ldrbne  r3, [r1], #1
 191 |     ldrbne  r4, [r1], #1
 192 |     stmfd   sp!, {r5, r6}
 193 |     subne   r2, r2, #2
 194 |     orrne   r3, r3, r4, asl #8
 195 |     strhne  r3, [r0], #2
 196 |     /* destination address is 4 bytes aligned */
 197 |     /* now we should handle 4 cases of source address alignment */
 198 |     tst     r1, #1
 199 |     bne     6f
 200 |     tst     r1, #2
 201 |     bne     7f
 202 | 
 203 |     /* both source and destination are 4 bytes aligned */
 204 |     stmfd   sp!, {r7, r8, r9, r10, r11}
 205 |     tst     r0, #4
 206 |     ldrne   r4, [r1], #4
 207 |     subne   r2, r2, #4
 208 |     strne   r4, [r0], #4
 209 |     tst     r0, #8
 210 |     ldmiane r1!, {r3-r4}
 211 |     add     r9, r1, #96
 212 |     subne   r2, r2, #8
 213 |     bic     r9, r9, #31
 214 |     stmiane r0!, {r3-r4}
 215 |     sub     r9, r9, r1
 216 | 1:
 217 |     subs    r2, r2, #32
 218 |     ldmiage r1!, {r3-r6, r7, r8, r10, r11}
 219 |     pld     [r1, r9]
 220 |     stmiage r0!, {r3-r6}
 221 |     stmiage r0!, {r7, r8, r10, r11}
 222 |     bgt     1b
 223 | 2:
 224 |     ldmfd   sp!, {r7, r8, r9, r10, r11}
 225 |     tst     r2, #16
 226 |     ldmiane r1!, {r3-r6}
 227 |     stmiane r0!, {r3-r6}
 228 |     tst     r2, #8
 229 |     ldmiane r1!, {r3-r4}
 230 |     stmiane r0!, {r3-r4}
 231 |     tst     r2, #4
 232 |     ldrne   r3, [r1], #4
 233 |     mov     ip, r0
 234 |     strne   r3, [ip], #4
 235 |     tst     r2, #2
 236 |     ldrhne  r3, [r1], #2
 237 |     ldmfd   sp!, {r5, r6}
 238 |     strhne  r3, [ip], #2
 239 |     tst     r2, #1
 240 |     ldrbne  r3, [r1], #1
 241 |     ldmfd   sp!, {r0, r4}
 242 |     strbne  r3, [ip], #1
 243 | 
 244 |     bx      lr
 245 | 
 246 | 6:
 247 |     tst    r1, #2
 248 |     bne    8f
 249 |     UNALIGNED_MEMCPY 1
 250 | 7:
 251 |     UNALIGNED_MEMCPY 2
 252 | 8:
 253 |     UNALIGNED_MEMCPY 3
 254 | 9:
 255 |     stmfd  sp!, {r0, r4}
 256 | 1:  subs   r2, r2, #3
 257 |     ldrbge ip, [r0]
 258 |     ldrbge r3, [r1], #1
 259 |     ldrbge r4, [r1], #1
 260 |     ldrbge ip, [r1], #1
 261 |     strbge r3, [r0], #1
 262 |     strbge r4, [r0], #1
 263 |     strbge ip, [r0], #1
 264 |     bge    1b
 265 |     adds   r2, r2, #2
 266 |     ldrbge r3, [r1], #1
 267 |     mov    ip, r0
 268 |     ldr    r0, [sp], #4
 269 |     strbge r3, [ip], #1
 270 |     ldrbgt r3, [r1], #1
 271 |     ldr    r4, [sp], #4
 272 |     strbgt r3, [ip], #1
 273 |     bx     lr
 274 | .endfunc
 275 | 
 276 | #endif
 277 | 
 278 | /*
 279 |  * PRELOAD_CATCH_UP enables catching up the early preload offset with the preload offset in
 280 |  * the main loop.
 281 |  */
 282 | 
 283 | #define PRELOAD_CATCH_UP
 284 | 
 285 | /*
 286 |  * CHECK_EARLY_PRELOADS enables checks to avoid overfetching beyond the source region when
 287 |  * doing early preloads. This is currently only implemented for the unaligned case.
 288 |  * Due to the overhead it adds this option may not improve performance.
 289 |  */
 290 | // #define CHECK_EARLY_PRELOADS
 291 | 
 292 | /*
 293 |  * Allow unaligned memory access.
 294 |  */
 295 | 
 296 | #define UNALIGNED_ACCESS
 297 | 
 298 | /*
 299 |  * Helper macro for non-overfetching version.
 300 |  *
 301 |  * If preload_early == 1,
 302 |  * r6 is the address of the 32-byte aligned region containing the last source byte.
 303 |  * r3 is the address of the 32-byte aligned region where the first preload occurred, preloads
 304 |  * have occurred up to [r3 + line_size].
 305 |  *
 306 |  * Registers up to r7 have been saved on the stack.
 307 |  */
 308 | 
 309 | .macro  UNALIGNED_MEMCPY_VARIANT granularity, shift, line_size, write_align, block_write_size, preload_offset, preload_early, overfetch
 310 |     sub     r1, #(\shift)
 311 | .if \preload_early == 1
 312 |     add     r7, r3, #(\line_size * 2)
 313 | .endif
 314 |     ldr     ip, [r1], #4
 315 | .if \preload_early == 1
 316 | #ifdef CHECK_EARLY_PRELOADS
 317 | .if \overfetch == 0
 318 |     cmp     r6, r7
 319 |     /* Only preload if the source region extends into it. */
 320 |     blt     5f
 321 | .endif
 322 | #endif
 323 |     pld     [r7]
 324 | 5:
 325 | .endif
 326 | 
 327 |     tst     r0, #4
 328 |     movne   r3, ip, lsr #(\shift * 8)
 329 |     ldrne   ip, [r1], #4
 330 |     subne   r2, r2, #4
 331 |     orrne   r3, r3, ip, asl #(32 - \shift * 8)
 332 |     strne   r3, [r0], #4
 333 | 
 334 |     tst     r0, #8
 335 |     movne   r3, ip, lsr #(\shift * 8)
 336 |     ldmiane r1!, {r4, ip}
 337 |     subne   r2, r2, #8
 338 |     orrne   r3, r3, r4, asl #(32 - \shift * 8)
 339 |     movne   r4, r4, lsr #(\shift * 8)
 340 |     orrne   r4, r4, ip, asl #(32 - \shift * 8)
 341 |     stmiane r0!, {r3-r4}
 342 | 
 343 | .if \write_align >= 32
 344 |     tst     r0, #16
 345 |     movne   r3, ip, lsr #(\shift * 8)
 346 |     beq     5f
 347 |     ldmia   r1!, {r4-r6, ip}
 348 |     sub     r2, r2, #16
 349 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 350 |     mov     r4, r4, lsr #(\shift * 8)
 351 | .if \write_align == 32
 352 |     cmp     r2, #32
 353 | .endif
 354 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 355 |     mov     r5, r5, lsr #(\shift * 8)
 356 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 357 |     mov     r6, r6, lsr #(\shift * 8)
 358 |     orr     r6, r6, ip, asl #(32 - \shift * 8)
 359 |     stmia   r0!, {r3-r6}
 360 | .if \write_align == 32
 361 |     blt     3f
 362 |     b       1f
 363 | .endif
 364 | 5:
 365 | .endif
 366 | 
 367 | .if \write_align == 64
 368 |     tst     r0, #32
 369 |     movne   r3, ip, lsr #(\shift * 8)
 370 |     beq     5f
 371 |     ldmia   r1!, {r4-r6, ip}
 372 |     sub     r2, r2, #32
 373 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 374 |     mov     r4, r4, lsr #(\shift * 8)
 375 |     cmp     r2, #32
 376 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 377 |     mov     r5, r5, lsr #(\shift * 8)
 378 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 379 |     mov     r6, r6, lsr #(\shift * 8)
 380 |     orr     r6, r6, ip, asl #(32 - \shift * 8)
 381 |     stmia   r0!, {r3-r6}
 382 |     mov     r3, ip, lsr #(\shift * 8)
 383 |     ldmia   r1!, {r4-r6, ip}
 384 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 385 |     mov     r4, r4, lsr #(\shift * 8)
 386 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 387 |     mov     r5, r5, lsr #(\shift * 8)
 388 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 389 |     mov     r6, r6, lsr #(\shift * 8)
 390 |     orr     r6, r6, ip, asl #(32 - \shift * 8)
 391 |     stmia   r0!, {r3-r6}
 392 |     blt     3f
 393 |     b       1f
 394 | 5:
 395 | .endif
 396 | 
 397 |     cmp     r2, #32
 398 |     blt     3f
 399 | 1:
 400 | .if \preload_offset != 0
 401 | .if \overfetch == 1
 402 |     cmp     r2, #64
 403 | .else
 404 |     cmp     r2, #\preload_offset
 405 | .endif
 406 | .endif
 407 |     stmfd   sp!, {r8, r9, r10, r11}
 408 | .if \preload_offset != 0
 409 |     add     r10, r1, #\preload_offset
 410 | #ifdef PRELOAD_CATCH_UP
 411 | .if \preload_early == 1 && \preload_offset >= 64 && \block_write_size >= 16
 412 |     add     r7, r7, #(\line_size * 2)
 413 | .endif
 414 | #endif
 415 |     bic     r10, r10, #(\line_size - 1)
 416 |     sub     r9, r10, r1
 417 | .if \overfetch == 0
 418 |     /* If there are <= preload_offset bytes to go, skip the main loop. */
 419 |     ble     4f
 420 | .else
 421 |     blt     1f
 422 | .endif
 423 | .if \preload_early == 1 && \preload_offset >= 64 && \block_write_size >= 16
 424 |     /*
 425 |      * At this point, if overfetch is 0, there are at least preload_offset
 426 |      * bytes left, so when CHECK_EARLY_PRELOAD is set, we only need to
 427 |      * perform a check if it is possible that the preload overfetches,
 428 |      * given that the upcoming early preload is the 4th one (making a
 429 |      * total of line_size * 4 byte preloaded from the 32-byte aligned
 430 |      * start address).
 431 |      */
 432 | #ifdef PRELOAD_CATCH_UP
 433 | #ifdef CHECK_EARLY_PRELOADS
 434 | .if \preload_offset < (\line_size * 4)
 435 |     add     r3, r1, r2
 436 |     mov     r11, r7
 437 |     sub     r3, r3, #1
 438 |     sub     r7, #\line_size
 439 |     bic     r3, r3, #(\line_size - 1)
 440 |     cmp     r7, r3
 441 |     add     r7, #\line_size
 442 |     bgt     5f
 443 |     pld     [r7]
 444 | 5:
 445 | .else
 446 |     mov     r11, r7
 447 |     pld     [r7, #-\line_size]
 448 | .endif
 449 | #else
 450 |     mov     r11, r7
 451 |     pld     [r7, #-\line_size]
 452 | #endif
 453 | #else
 454 | #ifdef CHECK_EARLY_PRELOADS
 455 | .if \preload_offset < (\line_size * 4)
 456 |     add     r3, r1, r2
 457 |     add     r7, #\line_size
 458 |     sub     r3, r3, #1
 459 |     bic     r3, r3, #(\line_size - 1)
 460 |     cmp     r7, r3
 461 |     bgt     5f
 462 |     pld     [r7]
 463 | 5:
 464 | .else
 465 |     pld     [r7, #\line_size]
 466 | .endif
 467 | #else
 468 |     pld     [r7, #\line_size]
 469 | #endif
 470 | #endif
 471 | #ifdef PRELOAD_CATCH_UP
 472 |     /*
 473 |      * The last preload already done is at [r11 - line_size].
 474 |      * The next preload in the main loop will happen at [r10].
 475 |      * If r11 < r10, we want to do an extra preload at [r11].
 476 |      * Note if write alignment is 64, it may become unaligned.
 477 |      */
 478 | 18:
 479 |     cmp     r11, r10
 480 |     movlt   r3, ip, lsr #(\shift * 8)
 481 |     ldmialt r1!, {r4-r6, r7}
 482 |     add     r11, #64
 483 |     orrlt   r3, r3, r4, asl #(32 - \shift * 8)
 484 |     movlt   r4, r4, lsr #(\shift * 8)
 485 |     bge     1f
 486 |     cmp     r2, #(\preload_offset + 32)
 487 |     pld     [r11, #-64]
 488 |     sub     r2, r2, #32
 489 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 490 |     mov     r5, r5, lsr #(\shift * 8)
 491 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 492 |     mov     r6, r6, lsr #(\shift * 8)
 493 |     orr     r6, r6, r7, asl #(32 - \shift * 8)
 494 |     mov     r7, r7, lsr #(\shift * 8)
 495 |     stmia   r0!, {r3-r6}
 496 |     mov     r3, r7
 497 |     ldmia   r1!, {r4, r5, r6, ip}
 498 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 499 |     add     r10, r1, r9
 500 |     mov     r4, r4, lsr #(\shift * 8)
 501 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 502 |     mov     r5, r5, lsr #(\shift * 8)
 503 | .if \line_size == 32
 504 |     pld     [r11, #-32]
 505 | .endif
 506 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 507 |     mov     r6, r6, lsr #(\shift * 8)
 508 |     orr     r6, r6, ip, asl #(32 - \shift * 8)
 509 |     stmia   r0!, {r3, r4, r5, r6}
 510 |     bgt     18b
 511 | .if \overfetch == 0
 512 |     b       4f
 513 | .endif
 514 | #endif
 515 | .endif
 516 | 1:
 517 | .if \line_size == 64 || \write_align == 64
 518 |     /* Process 64 bytes at a time. */
 519 | .if \overfetch == 1
 520 |     cmp     r2, #(64 + 64)
 521 | .else
 522 |     cmp     r2, #(\preload_offset + 64)
 523 | .endif
 524 |     pld     [r1, r9]
 525 |     mov     r3, ip, lsr #(\shift * 8)
 526 |     ldmia   r1!, {r4-r6, r7, r8, r10, r11, ip}
 527 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 528 |     mov     r4, r4, lsr #(\shift * 8)
 529 |     sub     r2, r2, #32
 530 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 531 |     mov     r5, r5, lsr #(\shift * 8)
 532 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 533 |     mov     r6, r6, lsr #(\shift * 8)
 534 |     orr     r6, r6, r7, asl #(32 - \shift * 8)
 535 |     mov     r7, r7, lsr #(\shift * 8)
 536 | .if \block_write_size == 16
 537 |     stmia   r0!, {r3-r6}
 538 | .endif
 539 |     orr     r7, r7, r8, asl #(32 - \shift * 8)
 540 |     mov     r8, r8, lsr #(\shift * 8)
 541 |     orr     r8, r8, r10, asl #(32 - \shift * 8)
 542 |     mov     r10, r10, lsr #(\shift * 8)
 543 | .if \block_write_size == 8
 544 |     stmia   r0!, {r7-r8}
 545 | .endif
 546 |     orr     r10, r10, r11, asl #(32 - \shift * 8)
 547 |     mov     r11, r11, lsr #(\shift * 8)
 548 |     orr     r11, r11, ip, asl #(32 - \shift * 8)
 549 | .if \block_write_size == 32
 550 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
 551 | .endif
 552 | .if \block_write_size == 16
 553 |     stmia   r0!, {r7, r8, r10, r11}
 554 | .endif
 555 | .if \line_size == 32
 556 |     pld     [r1, r9]
 557 | .endif
 558 |     mov     r3, ip, lsr #(\shift * 8)
 559 |     ldmia   r1!, {r4-r6, r7, r8, r10, r11, ip}
 560 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 561 |     mov     r4, r4, lsr #(\shift * 8)
 562 |     sub     r2, r2, #32
 563 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 564 |     mov     r5, r5, lsr #(\shift * 8)
 565 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 566 |     mov     r6, r6, lsr #(\shift * 8)
 567 |     orr     r6, r6, r7, asl #(32 - \shift * 8)
 568 |     mov     r7, r7, lsr #(\shift * 8)
 569 | .if \block_write_size == 16
 570 |     stmia   r0!, {r3-r6}
 571 | .endif
 572 |     orr     r7, r7, r8, asl #(32 - \shift * 8)
 573 |     mov     r8, r8, lsr #(\shift * 8)
 574 |     orr     r8, r8, r10, asl #(32 - \shift * 8)
 575 |     mov     r10, r10, lsr #(\shift * 8)
 576 | .if \block_write_size == 8
 577 |     stmia   r0!, {r7-r8}
 578 | .endif
 579 |     orr     r10, r10, r11, asl #(32 - \shift * 8)
 580 |     mov     r11, r11, lsr #(\shift * 8)
 581 |     orr     r11, r11, ip, asl #(32 - \shift * 8)
 582 | .if \block_write_size == 32
 583 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
 584 | .endif
 585 | .if \block_write_size == 16
 586 |     stmia   r0!, {r7, r8, r10, r11}
 587 | .endif
 588 | .else
 589 |     /* Process 32 bytes at a time. */
 590 | .if \overfetch == 1
 591 |     cmp     r2, #(32 + 32)
 592 | .else
 593 |     cmp     r2, #(\preload_offset + 32)
 594 | .endif
 595 |     pld     [r1, r9]
 596 |     mov     r3, ip, lsr #(\shift * 8)
 597 |     ldmia   r1!, {r4-r6, r7, r8, r10, r11, ip}
 598 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 599 |     mov     r4, r4, lsr #(\shift * 8)
 600 |     sub     r2, r2, #32
 601 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 602 |     mov     r5, r5, lsr #(\shift * 8)
 603 | .if \block_write_size == 8
 604 |     stmia   r0!, {r3-r4}
 605 | .endif
 606 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 607 |     mov     r6, r6, lsr #(\shift * 8)
 608 |     orr     r6, r6, r7, asl #(32 - \shift * 8)
 609 |     mov     r7, r7, lsr #(\shift * 8)
 610 | .if \block_write_size == 16
 611 |     stmia   r0!, {r3-r6}
 612 | .endif
 613 | .if \block_write_size == 8
 614 |     stmia   r0!, {r5-r6}
 615 | .endif
 616 |     orr     r7, r7, r8, asl #(32 - \shift * 8)
 617 |     mov     r8, r8, lsr #(\shift * 8)
 618 |     orr     r8, r8, r10, asl #(32 - \shift * 8)
 619 |     mov     r10, r10, lsr #(\shift * 8)
 620 | .if \block_write_size == 8
 621 |     stmia   r0!, {r7-r8}
 622 | .endif
 623 |     orr     r10, r10, r11, asl #(32 - \shift * 8)
 624 |     mov     r11, r11, lsr #(\shift * 8)
 625 |     orr     r11, r11, ip, asl #(32 - \shift * 8)
 626 | .if \block_write_size == 32
 627 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
 628 | .endif
 629 | .if \block_write_size == 16
 630 |     stmia   r0!, {r7, r8, r10, r11}
 631 | .endif
 632 | .if \block_write_size == 8
 633 |     stmia   r0!, {r10-r11}
 634 | .endif
 635 | .endif
 636 |     bge     1b
 637 | .endif /* preload_offset != 0 */
 638 | .if \overfetch == 0
 639 | 4:
 640 |     cmp     r2, #(32 + 32)
 641 |     mov     r3, ip, lsr #(\shift * 8)
 642 |     ldmia   r1!, {r4-r6, r7, r8, r10, r11, ip}
 643 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 644 |     sub     r2, r2, #32
 645 |     mov     r4, r4, lsr #(\shift * 8)
 646 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 647 |     mov     r5, r5, lsr #(\shift * 8)
 648 | .if \block_write_size == 8
 649 |     stmia   r0!, {r3-r4}
 650 | .endif
 651 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 652 |     mov     r6, r6, lsr #(\shift * 8)
 653 |     orr     r6, r6, r7, asl #(32 - \shift * 8)
 654 |     mov     r7, r7, lsr #(\shift * 8)
 655 | .if \block_write_size == 16
 656 |     stmia   r0!, {r3-r6}
 657 | .endif
 658 | .if \block_write_size == 8
 659 |     stmia   r0!, {r5-r6}
 660 | .endif
 661 |     orr     r7, r7, r8, asl #(32 - \shift * 8)
 662 |     mov     r8, r8, lsr #(\shift * 8)
 663 |     orr     r8, r8, r10, asl #(32 - \shift * 8)
 664 |     mov     r10, r10, lsr #(\shift * 8)
 665 | .if \block_write_size == 8
 666 |     stmia   r0!, {r7-r8}
 667 | .endif
 668 |     orr     r10, r10, r11, asl #(32 - \shift * 8)
 669 |     mov     r11, r11, lsr #(\shift * 8)
 670 |     orr     r11, r11, ip, asl #(32 - \shift * 8)
 671 | .if \block_write_size == 32
 672 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
 673 | .endif
 674 | .if \block_write_size == 16
 675 |     stmia   r0!, {r7, r8, r10, r11}
 676 | .endif
 677 | .if \block_write_size == 8
 678 |     stmia   r0!, {r10-r11}
 679 | .endif
 680 |     bge     4b
 681 | .endif /* overfetch == 0 */
 682 | 21:
 683 |     ldmfd   sp!, {r8, r9, r10, r11}
 684 | 3:  /* copy remaining data */
 685 |     tst     r2, #16
 686 |     ldmfd   sp!, {r7}
 687 |     mov     r3, ip, lsr #(\shift * 8)
 688 |     beq     1f
 689 |     ldmia   r1!, {r4-r6, ip}
 690 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
 691 |     mov     r4, r4, lsr #(\shift * 8)
 692 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
 693 |     mov     r5, r5, lsr #(\shift * 8)
 694 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
 695 |     mov     r6, r6, lsr #(\shift * 8)
 696 |     orr     r6, r6, ip, asl #(32 - \shift * 8)
 697 |     stmia   r0!, {r3-r6}
 698 | 1:
 699 |     tst     r2, #8
 700 |     movne   r3, ip, lsr #(\shift * 8)
 701 |     ldmiane r1!, {r4, ip}
 702 |     orrne   r3, r3, r4, asl #(32 - \shift * 8)
 703 |     movne   r4, r4, lsr #(\shift * 8)
 704 |     orrne   r4, r4, ip, asl #(32 - \shift * 8)
 705 |     stmiane r0!, {r3-r4}
 706 | 
 707 |     tst     r2, #4
 708 |     movne   r3, ip, lsr #(\shift * 8)
 709 |     ldrne   ip, [r1], #4
 710 |     sub     r1, r1, #(4 - \shift)
 711 |     orrne   r3, r3, ip, asl #(32 - \shift * 8)
 712 |     strne   r3, [r0], #4
 713 | 
 714 | .if \granularity <= 2
 715 |     tst     r2, #2
 716 |     ldrbne  r3, [r1], #1
 717 |     ldrbne  r4, [r1], #1
 718 | .endif
 719 |     ldr     r5, [sp], #4
 720 | .if \granularity <= 2
 721 |     strbne  r3, [r0], #1
 722 |     strbne  r4, [r0], #1
 723 | .endif
 724 | 
 725 | .if \granularity == 1
 726 |     tst     r2, #1
 727 |     ldrbne  r3, [r1], #1
 728 | .endif
 729 |     ldr     r6, [sp], #4
 730 | .if \granularity == 1
 731 |     strbne  r3, [r0], #1
 732 | .endif
 733 | 
 734 |     pop     {r0, r4}
 735 | 
 736 |     bx      lr
 737 | .endm
 738 | 
 739 | 
 740 | /*
 741 |  * Macro that defines the main body of a memcpy version with optional no over-fetching
 742 |  * beyond the source memory region.
 743 |  *
 744 |  * granularity must be 1, 2 or 4. This value is 1 for normal memcpy, 2 for operations on half-word
 745 |  * aligned regions such as 16bpp framebuffers/images, and 4 for operations on word aligned regions
 746 |  * such as 32bpp framebuffers\images.
 747 |  * line_size must be 32 or 64.
 748 |  * write_align must be 32 or 16, or 64.
 749 |  * block_write_size must be 32, 16 or 8.
 750 |  * preload_offset must be a multiple of 32, 96 was the default setting. When preload_offset is 0,
 751 |  * no preload instructions will be generated at all.
 752 |  * preload_early must be 0 or 1.
 753 |  * overfetch must be 0 or 1.
 754 |  *
 755 |  * If line_size is 64, write_align must be 32 or 64, block_write_size must be 32, and preload_offset
 756 |  * must be a multiple of 64.
 757 |  *
 758 |  * If line_size is 64 or write_align is 64, overfetch must be 0.
 759 |  */
 760 | 
 761 | .macro MEMCPY_VARIANT granularity, line_size, write_align, block_write_size, preload_offset, preload_early, overfetch
 762 |     cmp     r2, #52
 763 |     bic     r3, r1, #(\line_size - 1)
 764 | .if \preload_early == 1
 765 |     pld     [r3]
 766 | .endif
 767 |     /* Jump if we have a large size. */
 768 |     bge     1f
 769 | 
 770 | .if \granularity <= 2
 771 |     /*
 772 |      * Small sizes. Test whether both source and destination are word aligned.
 773 |      */
 774 |     tst    r0, #3
 775 |     andseq r3, r1, #3
 776 |     /* If not, jump to the unaligned code for small sizes */
 777 |     mov    ip, r0
 778 |     bne    9f
 779 | .else
 780 |     mov    ip, r0
 781 | .endif
 782 | 
 783 |     /* Copy words. Fast path for small sizes with word aligned src and dest. */
 784 |     /* ip must be equal to the original r0. */
 785 | 29:
 786 | 22:
 787 |     cmp    r2, #8
 788 |     ldrge  r3, [r1], #4
 789 |     strge  r3, [r0], #4
 790 |     ldrge  r3, [r1], #4
 791 |     sub    r2, r2, #8
 792 |     strge  r3, [r0], #4
 793 |     bgt    22b
 794 |     moveq  r0,ip
 795 |     bxeq   lr
 796 |     tst    r2, #4
 797 |     ldrne  r3, [r1], #4
 798 |     strne  r3, [r0], #4
 799 |     tst    r2, #3
 800 |     moveq  r0, ip
 801 |     bxeq   lr
 802 | .if \granularity <= 2
 803 |     tst    r2, #2
 804 |     ldrhne r3, [r1], #2
 805 |     strhne r3, [r0], #2
 806 | .endif
 807 | .if \granularity == 1
 808 |     tst    r2, #1
 809 |     ldrbne r3, [r1]
 810 |     strbne r3, [r0]
 811 | .endif
 812 |     mov    r0, ip
 813 |     bx     lr
 814 | 
 815 | 1:
 816 |     /*
 817 |      * Larger sizes. Copy data until destination address is 4 bytes aligned.
 818 |      * Optimize the common case in which both source and destination are
 819 |      * are already word-aligned.
 820 |      */
 821 | .if \granularity == 1
 822 |     tst     r0, #3
 823 |     stmfd   sp!, {r0, r4}
 824 |     andseq  r3, r1, #3
 825 |     stmfd   sp!, {r5, r6}
 826 | #ifdef CHECK_EARLY_PRELOADS
 827 | .if \preload_early == 1
 828 |     /* Determine the 32-byte aligned address of the last byte. */
 829 |     addeq   r6, r1, r2
 830 | .endif
 831 | #endif
 832 |     beq 2f
 833 | .else
 834 |     stmfd   sp!, {r0, r4}
 835 |     stmfd   sp!, {r5, r6}
 836 | .endif
 837 | 
 838 | .if \granularity == 1
 839 |     tst     r0, #1
 840 |     ldrbne  r4, [r1], #1
 841 |     subne   r2, r2, #1
 842 |     strbne  r4, [r0], #1
 843 | .endif
 844 | 
 845 | .if \granularity <= 2
 846 |     tst     r0, #2
 847 |     ldrbne  r4, [r1], #1
 848 | .endif
 849 | .if \granularity <= 2
 850 |     ldrbne  r5, [r1], #1
 851 |     subne   r2, r2, #2
 852 |     orrne   r4, r4, r5, asl #8
 853 | .endif
 854 | #ifdef CHECK_EARLY_PRELOADS
 855 | .if \preload_early == 1
 856 |     /* Determine the 32-byte aligned address of the last byte. */
 857 |     add     r6, r1, r2
 858 | .endif
 859 | #endif
 860 | .if \granularity <= 2
 861 |     strhne  r4, [r0], #2
 862 | .endif
 863 |     /* destination address is 4 bytes aligned */
 864 | 
 865 | .if \granularity == 1
 866 |     tst     r1, #1
 867 | .endif
 868 | #ifdef CHECK_EARLY_PRELOADS
 869 | .if \preload_early == 1
 870 |     sub     r6, r6, #1
 871 | .endif
 872 | #endif
 873 | #ifdef CHECK_EARLY_PRELOADS
 874 | .if \preload_early == 1
 875 |     bic     r6, r6, #(\line_size - 1)
 876 | .endif
 877 | #endif
 878 | 
 879 |     /* now we should handle 4 cases of source address alignment */
 880 | .if \granularity == 1
 881 |     bne     6f
 882 | .endif
 883 | .if \granularity <= 2
 884 |     tst     r1, #2
 885 | .endif
 886 |     stmfd   sp!, {r7}
 887 | .if \granularity <= 2
 888 |     bne     7f
 889 | .endif
 890 |     tst     r0, #4
 891 |     b       3f
 892 | 
 893 | 2:
 894 |     /* Further optimize for the 16-byte aligned case. */
 895 |     tst     r0, #12
 896 | #ifdef CHECK_EARLY_PRELOADS
 897 | .if \preload_early == 1
 898 |     sub     r6, r6, #1
 899 | .endif
 900 | #endif
 901 | .if \preload_early == 1
 902 |     pld     [r3, #\line_size]
 903 | .endif
 904 | #ifdef CHECK_EARLY_PRELOADS
 905 | .if \preload_early == 1
 906 |     bic     r6, r6, #(\line_size - 1)
 907 | .endif
 908 | #endif
 909 |     stmfd   sp!, {r7}
 910 |     beq     1f
 911 |     tst     r0, #4
 912 | 
 913 | 3:
 914 |     /* both source and destination are 4 bytes aligned */
 915 | #ifdef CHECK_EARLY_PRELOADS
 916 | .if \preload_early == 1
 917 |     mov     ip, r6
 918 | .endif
 919 | #endif
 920 |     ldrne   r5, [r1], #4
 921 |     subne   r2, r2, #4
 922 |     strne   r5, [r0], #4
 923 |     tst     r0, #8
 924 |     ldmiane r1!, {r4, r5}
 925 |     subne   r2, r2, #8
 926 |     stmiane r0!, {r4, r5}
 927 | 1:
 928 | .if \write_align >= 32
 929 |     tst     r0, #16
 930 |     ldmiane r1!, {r4-r7}
 931 |     subne   r2, r2, #16
 932 |     stmiane r0!, {r4-r7}
 933 | .endif
 934 | .if \write_align == 64
 935 |     tst     r0, #32
 936 |     ldmiane r1!, {r4-r7}
 937 |     subne   r2, r2, #32
 938 |     stmiane r0!, {r4-r7}
 939 |     ldmiane r1!, {r4-r7}
 940 |     stmiane r0!, {r4-r7}
 941 | .endif
 942 |     /* Source is now write_align bytes aligned. */
 943 | 
 944 |     /*
 945 |      * The chunk size is defined is 64 if write_align == 64 or line_size = 64;
 946 |      * otherwise, it is equal to write_align.
 947 |      * If the number of bytes left is smaller than the chunk size, skip all loops.
 948 |      * If the number of bytes left is <= (preload_offset + chunk_size), skip the 
 949 |      * loop with preload and jump to the loop without preload.
 950 |      * Also calculate the preload offset in r9 and the address of the next main loop preload
 951 |      * in r5 if early preload is enabled and PRELOAD_CATCH_UP is set.
 952 |      * If preload is enabled, r3 is updated to hold the address of the next early preload.
 953 |      */
 954 | .if \preload_offset == 0
 955 |     cmp     r2, #32
 956 |     blt     14f
 957 |     stmfd   sp!, {r8, r9, r10, r11}
 958 | .elseif \write_align == 64 || \line_size == 64
 959 |     cmp     r2, #64
 960 | .if \line_size == 64 && \write_align == 32
 961 |     add     r5, r1, #\preload_offset
 962 | .endif
 963 | .if \preload_early == 1
 964 |     pld     [r3, #(\line_size * 2)]
 965 | #ifdef PRELOAD_CATCH_UP
 966 |     add     r3, #(\line_size * 3)
 967 | #endif
 968 | .endif
 969 | .if \line_size == 64 && \write_align == 32
 970 |     bic     r5, r5, #63
 971 | .else
 972 | #ifdef PRELOAD_CATCH_UP
 973 | .if \preload_early == 1
 974 |     add     r5, r1, #\preload_offset
 975 | .endif
 976 | #endif
 977 | .endif
 978 |     blt     2f
 979 |     cmp     r2, #(\preload_offset + 64)
 980 |     stmfd   sp!, {r8, r9, r10, r11}
 981 | .if \line_size == 64 && \write_align == 32
 982 |     sub     r9, r5, r1
 983 | .else
 984 |     mov     r9, #\preload_offset
 985 | .endif
 986 | .if \overfetch == 1
 987 |     ble     1f
 988 | .else
 989 |     ble     10f
 990 | .endif
 991 | .elseif \write_align == 32
 992 |     /* In the case of line_size == 32 and write_align == 32 r9 will be equal to preload_offset. */
 993 |     cmp     r2, #32
 994 | .if \preload_early == 1
 995 |     pld     [r3, #(\line_size * 2)]
 996 | #ifdef PRELOAD_CATCH_UP
 997 |     add     r3, #(\line_size * 3)
 998 | #endif
 999 | .endif
1000 | #ifdef PRELOAD_CATCH_UP
1001 | .if \preload_early == 1
1002 |     add     r5, r1, #\preload_offset
1003 | .endif
1004 | #endif
1005 |     blt     14f
1006 |     cmp     r2, #(\preload_offset + 32)
1007 |     stmfd   sp!, {r8, r9, r10, r11}
1008 |     mov     r9, #\preload_offset
1009 | .if \overfetch == 1
1010 |     ble     1f
1011 | .else
1012 |     ble     10f
1013 | .endif
1014 | .else // write_align == 16
1015 |     cmp     r2, #32
1016 |     add     r5, r1, #\preload_offset
1017 | .if \preload_early == 1
1018 |     pld     [r3, #(\line_size * 2)]
1019 | #ifdef PRELOAD_CATCH_UP
1020 |     add     r3, #(\line_size * 3)
1021 | #endif
1022 | .endif
1023 |     bic     r5, r5, #31
1024 |     /* If there are less than 32 bytes to go, skip all loops. */
1025 |     blt     14f
1026 |     cmp     r2, #(\preload_offset + 32)
1027 |     stmfd   sp!, {r8, r9, r10, r11}
1028 |     sub     r9, r5, r1
1029 |     /* If there are <= (preload_offset + 32) bytes to go, skip the main loop. */
1030 | .if \overfetch == 1
1031 |     ble     1f
1032 | .else
1033 |     ble     10f
1034 | .endif
1035 | .endif
1036 | 
1037 | .if \preload_offset != 0
1038 | .if \preload_early == 1
1039 | #ifndef PRELOAD_CATCH_UP
1040 |     pld     [r3, #(\line_size * 3)]
1041 | #else
1042 | .if \block_write_size >= 16 && \preload_offset >= 96
1043 |     /*
1044 |      * The last preload already done is at [r3 - line_size].
1045 |      * The next preload in the main loop will happen at [r5 + line_size].
1046 |      * If there are line-sized chunks in between that we have not yet preloaded,
1047 |      * we want to do preloads for them.
1048 |      */
1049 |     cmp     r3, r5
1050 |     bge     1f
1051 | #if 0
1052 |     /* Implement catch-up using a simple loop. */
1053 |     add     r3, r3, #\line_size
1054 | 13:
1055 |     cmp     r3, r5
1056 |     pld     [r3, #-\line_size]
1057 |     add     r3, r3, #\line_size
1058 |     blt     13b
1059 | #else
1060 |     /*
1061 |      * Implement catch-up while processing chunks. block_write_size of 32
1062 |      * uses 16-byte writes because of a lack of registers.
1063 |      * Note: if line_size is 64 and write alignment is 64, we have to be 
1064 |      * careful that write alignment remains 64 bytes.
1065 |      */
1066 |     pld     [r3]
1067 |     add     r3, r3, #\line_size
1068 | 13:
1069 |     cmp     r3, r5
1070 |     ldmialt r1!, {r7, r8, r10, r11}
1071 |     addlt   r5, r5, #32
1072 |     bge     1f
1073 | .if \line_size == 64 || \write_align == 64
1074 |     cmp     r2, #(\preload_offset + 64 + 32)
1075 | .else
1076 |     cmp     r2, #(\preload_offset + 32 + 32)
1077 | .endif
1078 |     stmia   r0!, {r7, r8, r10, r11}
1079 |     pld     [r3]
1080 |     add     r3, r3, #64
1081 |     ldmia   r1!, {r7, r8, r10, r11}
1082 |     sub     r2, r2, #32
1083 |     stmia   r0!, {r7, r8, r10, r11}
1084 | .if \line_size == 32
1085 |     pld     [r3, #-\line_size]
1086 | .endif
1087 | .if \line_size != 64 || \write_align != 64
1088 |     bgt     13b
1089 | .if \overfetch == 0
1090 |     b       10f
1091 | .endif
1092 | .else
1093 |     /*
1094 |      * If line_size is 64 and write_align is 64, make sure
1095 |      * the write alignment of 64 maintained.
1096 |      *
1097 |      * Jump if we don't need to do preloads anymore; 64-byte write
1098 |      * alignment is not important in this case.
1099 |      */
1100 |     add     r5, r5, #32
1101 |     ble     10f
1102 |     cmp     r3, r5
1103 |     ldmia   r1!, {r7, r8, r10, r11}
1104 |     /* In case of a jump, we will be doing more preloads so we */
1105 |     /* have to ensure 64 bytes write alignment. */
1106 |     bge     5f
1107 |     cmp     r2, #(\preload_offset + 64 + 32)
1108 |     stmia   r0!, {r7, r8, r10, r11}
1109 |     pld     [r3]
1110 |     add     r3, r3, #64
1111 |     ldmia   r1!, {r7, r8, r10, r11}
1112 |     sub     r2, r2, #32
1113 |     stmia   r0!, {r7, r8, r10, r11}
1114 |     bgt     13b
1115 | .if \overfetch == 0
1116 |     b       10f
1117 | .endif
1118 | .endif /* line_size == 64 write_alignment == 64 */
1119 | #endif
1120 | .else
1121 |     pld     [r3]
1122 | .endif
1123 | #endif
1124 | .endif /* preload_early == 1 */
1125 | 1:
1126 | .if \line_size == 64 || \write_align == 64
1127 | .if \overfetch == 1
1128 |     cmp     r2, #(64 + 64)
1129 | .else
1130 |     cmp     r2, #(\preload_offset + 64 + 64)
1131 | .endif
1132 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1133 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
1134 | .if \line_size == 32
1135 |     pld     [r1, r9]
1136 | .endif
1137 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1138 |     sub     r2, r2, #64
1139 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
1140 |     pld     [r1, r9]
1141 | .else
1142 | .if \overfetch == 1
1143 |     cmp     r2, #(32 + 32)
1144 | .else
1145 |     cmp     r2, #(\preload_offset + 32 + 32)
1146 | .endif
1147 | .if \block_write_size == 32
1148 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1149 |     sub     r2, r2, #32
1150 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
1151 |     pld     [r1, r9]
1152 | .endif
1153 | .if \block_write_size == 16
1154 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1155 |     sub     r2, r2, #32
1156 |     stmia   r0!, {r3-r6}
1157 |     pld     [r1, r9]
1158 |     stmia   r0!, {r7, r8, r10, r11}
1159 | .endif
1160 | .if \block_write_size == 8
1161 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1162 |     sub     r2, r2, #32
1163 |     stmia   r0!, {r3-r4}
1164 |     stmia   r0!, {r5-r6}
1165 |     pld     [r1, r9]
1166 |     stmia   r0!, {r7-r8}
1167 |     stmia   r0!, {r10-r11}
1168 | .endif
1169 | .endif /* line_size ==  64 */
1170 |     bge     1b
1171 | .endif /* preload_offset != 0 */
1172 | .if \overfetch == 0
1173 | 10:
1174 | .if \line_size == 64 || \write_align == 64
1175 |     cmp     r2, #(64 + 64)
1176 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1177 |     sub     r2, r2, #64
1178 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
1179 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1180 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
1181 | .else
1182 | .if \block_write_size == 32
1183 |     cmp     r2, #(32 + 32)
1184 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1185 |     sub     r2, r2, #32
1186 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
1187 | .endif
1188 | .if \block_write_size == 16
1189 |     cmp     r2, #(32 + 32)
1190 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1191 |     sub     r2, r2, #32
1192 |     stmia   r0!, {r3-r6}
1193 |     stmia   r0!, {r7, r8, r10, r11}
1194 | .endif
1195 | .if \block_write_size == 8
1196 |     cmp     r2, #(32 + 32)
1197 |     ldmia   r1!, {r3-r6, r7, r8, r10, r11}
1198 |     sub     r2, r2, #32
1199 |     stmia   r0!, {r3-r4}
1200 |     stmia   r0!, {r5-r6}
1201 |     stmia   r0!, {r7-r8}
1202 |     stmia   r0!, {r10-r11}
1203 | .endif
1204 | .endif /* line_size == 64 || write_align == 64  */
1205 |     bge     10b
1206 | .endif /* overfetch == 0 */
1207 |     ldmfd   sp!, {r8, r9, r10, r11}
1208 | 2:
1209 | .if \line_size == 64 || \write_align == 64
1210 |     tst     r2, #32
1211 |     ldmiane r1!, {r3-r6}
1212 |     stmiane r0!, {r3-r6}
1213 |     ldmiane r1!, {r3-r6}
1214 |     stmiane r0!, {r3-r6}
1215 | .endif
1216 | 14:
1217 |     tst     r2, #16
1218 |     ldmfd   sp!, {r7}
1219 |     ldmiane r1!, {r3-r6}
1220 |     stmiane r0!, {r3-r6}
1221 | 3:
1222 |     tst     r2, #8
1223 |     ldmiane r1!, {r3-r4}
1224 |     stmiane r0!, {r3-r4}
1225 |     tst     r2, #4
1226 |     ldrne   r3, [r1], #4
1227 |     mov     ip, r0
1228 |     strne   r3, [ip], #4
1229 | .if \granularity == 1
1230 |     /* Optimize for the word-sized case. */
1231 |     tst     r2, #3
1232 |     ldmfdeq sp!, {r5, r6}
1233 |     ldmfdeq sp!, {r0, r4}
1234 |     bxeq    lr
1235 | .endif
1236 | .if \granularity <= 2
1237 |     tst     r2, #2
1238 |     ldrhne  r3, [r1], #2
1239 | .endif
1240 |     ldmfd   sp!, {r5, r6}
1241 | .if \granularity <= 2
1242 |     strhne  r3, [ip], #2
1243 | .endif
1244 | .if \granularity == 1
1245 |     tst     r2, #1
1246 |     ldrbne  r3, [r1]
1247 | .endif
1248 |     ldmfd   sp!, {r0, r4}
1249 | .if \granularity == 1
1250 |     strbne  r3, [ip]
1251 | .endif
1252 |     bx      lr
1253 | 5:
1254 |     /* We get here in case we need to fix write alignment to 64 bytes. */
1255 |     stmia   r0!, {r7, r8, r10, r11}
1256 |     ldmia   r1!, {r7, r8, r10, r11}
1257 |     sub     r2, r2, #32
1258 |     stmia   r0!, {r7, r8, r10, r11}
1259 |     b       1b
1260 | .if \granularity == 1
1261 | 6:
1262 |     tst    r1, #2
1263 |     stmfd   sp!, {r7}
1264 |     bne    8f
1265 |     UNALIGNED_MEMCPY_VARIANT \granularity, 1, \line_size, \write_align, \block_write_size, \preload_offset, \preload_early, \overfetch
1266 | .endif
1267 | 7:
1268 |     UNALIGNED_MEMCPY_VARIANT \granularity, 2, \line_size, \write_align, \block_write_size, \preload_offset, \preload_early, \overfetch
1269 | .if \granularity == 1
1270 | 8:
1271 |     UNALIGNED_MEMCPY_VARIANT \granularity, 3, \line_size, \write_align, \block_write_size, \preload_offset, \preload_early, \overfetch
1272 | .endif
1273 | 
1274 |     .p2align 4
1275 | .if \granularity <= 2
1276 | 9:
1277 |     cmp     r2, #8
1278 |     push    {r0}
1279 |     blt     1f               /* Jump to special case for really small sizes. */
1280 |     
1281 |     /* copy data until destination address is 4 bytes aligned */
1282 | .if \granularity == 1
1283 |     tst     r0, #1
1284 |     ldrbne  r3, [r1], #1
1285 |     subne   r2, r2, #1
1286 |     strbne  r3, [r0], #1
1287 | .endif
1288 | 
1289 |     tst     r0, #2
1290 |     ldrbne  r3, [r1], #1
1291 |     ldrbne  ip, [r1], #1
1292 |     subne   r2, r2, #2
1293 |     orrne   r3, r3, ip, asl #8
1294 |     strhne  r3, [r0], #2
1295 |     /* destination address is 4 bytes aligned */
1296 | 
1297 |     /* now we should handle four cases of source address alignment */
1298 | .if \granularity == 1
1299 |     tst     r1, #1
1300 |     bne     25f
1301 | .endif
1302 |     tst     r1, #2
1303 |     popeq   {ip}
1304 |     beq     29b               /* Jump if the source is word aligned. */
1305 | 
1306 |     /* shift 2 */
1307 | //    sub     r1, r1, #2
1308 | //    ldr     ip, [r1], #4
1309 |     ldr     ip, [r1, #-2]
1310 |     add     r1, r1, #2
1311 | 23:
1312 |     subs    r2, r2, #4
1313 |     movge   r3, ip, lsr #(2 * 8)
1314 |     ldrge   ip, [r1], #4
1315 |     orrge   r3, r3, ip, asl #(32 - 2 * 8)
1316 |     strge   r3, [r0], #4
1317 |     bge     23b
1318 | 
1319 |     sub     r1, r1, #2
1320 |     tst     r2, #2
1321 |     ldrbne  r3, [r1], #1
1322 |     ldrbne  ip, [r1], #1
1323 |     strbne  r3, [r0], #1
1324 |     strbne  ip, [r0], #1
1325 | 
1326 | .if \granularity == 1
1327 |     tst     r2, #1
1328 |     mov     ip, r0
1329 |     ldrbne  r3, [r1]
1330 |     ldr     r0, [sp], #4
1331 |     strbne  r3, [ip]
1332 | .else
1333 |     pop     {r0}
1334 | .endif
1335 |     bx      lr
1336 | 
1337 |     /* Handle sizes < 8 */
1338 | 1:
1339 | .if \granularity == 2
1340 |     tst     r2, #4
1341 |     ldrhne  r3, [r1], #2
1342 |     ldrhne  ip, [r1], #2
1343 |     strhne  r3, [r0], #2
1344 |     strhne  ip, [r0], #2
1345 |     test    r2, #2
1346 |     mov     ip, r0
1347 |     ldrhne  r3, [r1]
1348 |     pop     {r0}
1349 |     strhne  r3, [ip]
1350 | .else
1351 |     tst     r2, #4
1352 |     ldrbne  r3, [r1], #1
1353 |     beq     2f
1354 |     ldrb    ip, [r1], #1
1355 |     strb    r3, [r0], #1
1356 |     strb    ip, [r0], #1
1357 |     ldrb    r3, [r1], #1
1358 |     ldrb    ip, [r1], #1
1359 |     strb    r3, [r0], #1
1360 |     strb    ip, [r0], #1
1361 | 2:
1362 |     tst     r2, #2
1363 |     ldrbne  r3, [r1], #1
1364 |     ldrbne  ip, [r1], #1
1365 |     strbne  r3, [r0], #1
1366 |     strbne  ip, [r0], #1
1367 |     tst     r2, #1
1368 |     mov     ip, r0
1369 |     ldrbne  r3, [r1]
1370 |     pop     {r0}
1371 |     strbne  r3, [ip]
1372 | .endif
1373 |     bx      lr
1374 | 
1375 | .if \granularity == 1
1376 | 24:
1377 |     /* shift 1 */
1378 | //    sub     r1, r1, #1
1379 | //    ldr     ip, [r1], #4
1380 |     ldr     ip, [r1, #-1]
1381 |     add     r1, r1, #3
1382 | 27:
1383 |     subs    r2, r2, #4
1384 |     movge   r3, ip, lsr #(1 * 8)
1385 |     ldrge   ip, [r1], #4
1386 |     orrge   r3, r3, ip, asl #(32 - 1 * 8)
1387 |     strge   r3, [r0], #4
1388 |     bge     27b
1389 | 
1390 |     sub     r1, r1, #3
1391 |     tst     r2, #2
1392 |     ldrbne  r3, [r1], #1
1393 |     ldrbne  ip, [r1], #1
1394 |     strbne  r3, [r0], #1
1395 |     strbne  ip, [r0], #1
1396 | 
1397 |     tst     r2, #1
1398 |     mov     ip, r0
1399 |     ldrbne  r3, [r1]
1400 |     ldr     r0, [sp], #4
1401 |     strbne  r3, [ip]
1402 |     bx      lr
1403 | 
1404 | 25:
1405 |     tst     r1, #2
1406 |     beq     24b          /* shift 1 */
1407 | 
1408 |     /* shift 3 */
1409 | 26:
1410 | //    sub     r1, r1, #3
1411 | //    ldr     ip, [r1], #4
1412 |     ldr     ip, [r1, #-3]
1413 |     add     r1, r1, #1
1414 | 28:
1415 |     subs    r2, r2, #4
1416 |     movge   r3, ip, lsr #(3 * 8)
1417 |     ldrge   ip, [r1], #4
1418 |     orrge   r3, r3, ip, asl #(32 - 3 * 8)
1419 |     strge   r3, [r0], #4
1420 |     bge     28b
1421 | 
1422 |     sub     r1, r1, #1
1423 |     tst     r2, #2
1424 |     ldrbne  r3, [r1], #1
1425 |     ldrbne  ip, [r1], #1
1426 |     strbne  r3, [r0], #1
1427 |     strbne  ip, [r0], #1
1428 | 
1429 |     tst     r2, #1
1430 |     mov     ip, r0
1431 |     ldrbne  r3, [r1]
1432 |     ldr     r0, [sp], #4
1433 |     strbne  r3, [ip]
1434 |     bx      lr
1435 | .endif /* granularity == 1 */
1436 | .endif /* granularity <= 2 */
1437 | 
1438 | .endm
1439 | 
1440 | /*
1441 |  * The following macros implement a simpler memcpy that is optimized with a fast path
1442 |  * for common cases and may use unaligned access for small sizes.
1443 |  *
1444 |  * line_size of 64 or 32 is supported, write_align must be 32 or 16, block_write_size
1445 |  * must be 32 or 16, early_preload and overfetch are enabled.
1446 |  */
1447 | 
1448 | .macro MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN write_align
1449 |     /* Align to a 16-byte or 32-byte boundary. */
1450 |     tst     r0, #4
1451 |     ldrne   r3, [r1], #4
1452 |     subne   r2, r2, #4
1453 |     strne   r3, [r0], #4
1454 |     tst     r0, #8
1455 |     ldrne   r3, [r1], #4
1456 |     ldrne   ip, [r1], #4
1457 |     subne   r2, r2, #8
1458 |     strne   r3, [r0], #4
1459 |     strne   ip, [r0], #4
1460 | .if \write_align >= 32
1461 |     tst     r0, #16
1462 |     ldrne   r3, [r1]
1463 |     beq     31f
1464 |     ldr     ip, [r1, #4]
1465 |     str     r3, [r0]
1466 |     sub     r2, r2, #16
1467 |     str     ip, [r0, #4]
1468 |     ldr     r3, [r1, #8]
1469 |     ldr     ip, [r1, #12]
1470 |     add     r1, #16
1471 |     str     r3, [r0, #8]
1472 |     str     ip, [r0, #12]
1473 |     add     r0, #16
1474 | 31:
1475 | .endif
1476 | .if \write_align == 64
1477 |     tst     r0, #32
1478 |     ldmiane r1!, {r3, ip}
1479 |     beq     32f
1480 |     stmia   r0!, {r3, ip}
1481 |     ldmia   r1!, {r3, ip}
1482 |     stmia   r0!, {r3, ip}
1483 |     ldmia   r1!, {r3, ip}
1484 |     stmia   r0!, {r3, ip}
1485 |     ldmia   r1!, {r3, ip}
1486 |     sub     r2, r2, #32
1487 |     stmia   r0!, {r3, ip}
1488 | 32:
1489 | .endif
1490 | .endm
1491 | 
1492 | .macro MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN_CUSTOM
1493 |     /*
1494 |      * Align destination to a 16-byte or 32-byte boundary,
1495 |      * depending on whether the 32-byte alignment of the
1496 |      * source is optimal.
1497 |      */
1498 |     tst     r0, #4
1499 |     ldrne   r3, [r1], #4
1500 |     subne   r2, r2, #4
1501 |     strne   r3, [r0], #4
1502 |     tst     r0, #8
1503 |     ldrne   r3, [r1], #4
1504 |     ldrne   ip, [r1], #4
1505 |     subne   r2, r2, #8
1506 |     strne   r3, [r0], #4
1507 |     strne   ip, [r0], #4
1508 |     /*
1509 |      * If (source & 16) is zero, allow write aligning to 32 bytes.
1510 |      * This improves performance.
1511 |      */
1512 |     eor     r3, r1, r0
1513 |     tst     r0, #16
1514 |     tstne   r3, #16
1515 |     ldrne   r3, [r1]
1516 |     beq     31f
1517 |     ldr     ip, [r1, #4]
1518 |     str     r3, [r0]
1519 |     sub     r2, r2, #16
1520 |     str     ip, [r0, #4]
1521 |     ldr     r3, [r1, #8]
1522 |     ldr     ip, [r1, #12]
1523 |     add     r1, #16
1524 |     str     r3, [r0, #8]
1525 |     str     ip, [r0, #12]
1526 |     add     r0, #16
1527 | 31:
1528 | .endm
1529 | 
1530 | .macro MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART shift, line_size, write_align, block_write_size, preload_offset, custom_write_align
1531 |     ldr     ip, [r1, #(-\shift)]
1532 |     add     r1, r1, #(4 - \shift)
1533 | 
1534 |     tst     r0, #4
1535 |     push    {r4-r6}
1536 |     movne   r3, ip, lsr #(\shift * 8)
1537 |     ldrne   ip, [r1], #4
1538 |     subne   r2, r2, #4
1539 |     orrne   r3, r3, ip, asl #(32 - \shift * 8)
1540 |     strne   r3, [r0], #4
1541 | 
1542 |     tst     r0, #8
1543 |     movne   r3, ip, lsr #(\shift * 8)
1544 |     ldmiane r1!, {r4, ip}
1545 |     subne   r2, r2, #8
1546 |     orrne   r3, r3, r4, asl #(32 - \shift * 8)
1547 |     movne   r4, r4, lsr #(\shift * 8)
1548 |     orrne   r4, r4, ip, asl #(32 - \shift * 8)
1549 |     stmiane r0!, {r3-r4}
1550 | 
1551 | .if \write_align == 32
1552 | .if \custom_write_align == 1
1553 |     eor     r3, r1, r0
1554 |     tst     r0, #16
1555 |     tstne   r3, #16
1556 | .else
1557 |     tst     r0, #16
1558 | .endif
1559 |     movne   r3, ip, lsr #(\shift * 8)
1560 |     beq     25f
1561 |     ldmia   r1!, {r4-r6, ip}
1562 |     sub     r2, r2, #16
1563 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
1564 |     mov     r4, r4, lsr #(\shift * 8)
1565 | .if (68 - (\write_align - 1)) < 32
1566 |     cmp     r2, #32
1567 | .endif
1568 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
1569 |     mov     r5, r5, lsr #(\shift * 8)
1570 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
1571 |     mov     r6, r6, lsr #(\shift * 8)
1572 |     orr     r6, r6, ip, asl #(32 - \shift * 8)
1573 |     stmia   r0!, {r3-r6}
1574 | .if (68 - (\write_align - 1)) < 32
1575 |     blt     22f
1576 |     b       26f
1577 | .endif
1578 | 25:
1579 | .endif
1580 | 
1581 |     /*
1582 |      * We don't need a check if the number of bytes left is guaranteed to
1583 |      * be >= 32.
1584 |      */
1585 | .if (68 - (\write_align - 1)) < 32
1586 |     cmp     r2, #32
1587 |     blt     22f
1588 | .endif
1589 | 26:
1590 | .if \write_align == \line_size && 0
1591 |     push    {r7-r11}
1592 |     mov     r9, #\preload_offset
1593 |     sub     r2, r2, #32
1594 | .else
1595 |     add     r3, r1, #\preload_offset
1596 |     push    {r7-r11}
1597 |     bic     r3, r3, #(\line_size - 1)
1598 |     sub     r2, r2, #32
1599 |     sub     r9, r3, r1
1600 | .endif
1601 |     /*
1602 |      * Main loop for unaligned copy. Process 32 bytes at a time.
1603 |      */
1604 | 21:
1605 |     pld     [r1, r9]
1606 |     mov     r3, ip, lsr #(\shift * 8)
1607 |     ldmia   r1!, {r4-r6, r7, r8, r10, r11, ip}
1608 |     orr     r3, r3, r4, asl #(32 - \shift * 8)
1609 |     mov     r4, r4, lsr #(\shift * 8)
1610 |     subs    r2, r2, #32
1611 |     orr     r4, r4, r5, asl #(32 - \shift * 8)
1612 |     mov     r5, r5, lsr #(\shift * 8)
1613 |     orr     r5, r5, r6, asl #(32 - \shift * 8)
1614 |     mov     r6, r6, lsr #(\shift * 8)
1615 |     orr     r6, r6, r7, asl #(32 - \shift * 8)
1616 |     mov     r7, r7, lsr #(\shift * 8)
1617 | .if \block_write_size == 16
1618 |     stmia   r0!, {r3-r6}
1619 | .endif
1620 |     orr     r7, r7, r8, asl #(32 - \shift * 8)
1621 |     mov     r8, r8, lsr #(\shift * 8)
1622 |     orr     r8, r8, r10, asl #(32 - \shift * 8)
1623 |     mov     r10, r10, lsr #(\shift * 8)
1624 |     orr     r10, r10, r11, asl #(32 - \shift * 8)
1625 |     mov     r11, r11, lsr #(\shift * 8)
1626 |     orr     r11, r11, ip, asl #(32 - \shift * 8)
1627 | .if \block_write_size == 32
1628 |     stmia   r0!, {r3-r6, r7, r8, r10, r11}
1629 | .endif
1630 | .if \block_write_size == 16
1631 |     stmia   r0!, {r7, r8, r10, r11}
1632 | .endif
1633 |     bge     21b
1634 |     adds    r2, r2, #32
1635 |     pop     {r7-r11}
1636 |     popeq   {r4-r6}
1637 |     popeq   {r0}
1638 |     bxeq    lr
1639 | 22:
1640 |     pop     {r4-r6}
1641 | 23:
1642 |     subs    r2, r2, #4
1643 |     movge   r3, ip, lsr #(\shift * 8)
1644 |     ldrge   ip, [r1], #4
1645 |     orrge   r3, r3, ip, asl #(32 - \shift * 8)
1646 |     strge   r3, [r0], #4
1647 |     bgt     23b
1648 | 
1649 | 24:
1650 |     sub     r1, r1, #(4 - \shift)
1651 | .endm
1652 | 
1653 | 
1654 | .macro MEMCPY_VARIANT_SIMPLE granularity, line_size, write_align, block_write_size, \
1655 | preload_offset, preload_catch_up, preload_early, overfetch, custom_write_align, \
1656 | check_small_size_alignment
1657 |     cmp     r2, #68
1658 | .if \preload_early == 1
1659 |     bic     r3, r1, #(\line_size - 1)
1660 | .endif
1661 |     mov     ip, r0
1662 | .if \preload_early == 1
1663 |     pld     [r3]
1664 | .endif
1665 |     bge     1f
1666 | 
1667 |     /*
1668 |      * Path for sizes < 68 bytes; don't care about unaligned access
1669 |      * except if both the source and destination are unaligned and
1670 |      * the number of bytes is > 32. This checks costs a few percent
1671 |      * performance for the common word aligned-case.
1672 |      */
1673 | .if \check_small_size_alignment == 1
1674 | .if \granularity <= 2
1675 |     /* This assumes lt flag is set. */
1676 |     tst     r0, #3
1677 |     tstne   r1, #3
1678 |     cmpne   r2, #32
1679 |     bgt     2f
1680 | .endif
1681 | .endif
1682 | 3:
1683 |     tst    r2, #4
1684 |     ldrne  r3, [r1], #4
1685 |     subne  r2, r2, #4
1686 |     strne  r3, [r0], #4
1687 | 4:
1688 |     cmp    r2, #8
1689 |     ldrge  r3, [r1], #4
1690 |     strge  r3, [r0], #4
1691 |     ldrge  r3, [r1], #4
1692 |     subge  r2, r2, #8
1693 |     strge  r3, [r0], #4
1694 |     bgt    4b
1695 | .if \granularity <= 2
1696 |     tstne  r2, #3
1697 |     moveq  r0, ip
1698 |     bxeq   lr
1699 | .endif
1700 | .if \granularity <= 2
1701 |     tst    r2, #2
1702 |     ldrhne r3, [r1], #2
1703 |     strhne r3, [r0], #2
1704 | .endif
1705 | .if \granularity == 1
1706 |     tst    r2, #1
1707 |     ldrbne r3, [r1]
1708 |     strbne r3, [r0]
1709 | .endif
1710 |     mov    r0, ip
1711 |     bx     lr
1712 | 
1713 | .if \check_small_size_alignment == 1
1714 | .if \granularity <= 2
1715 | 2:
1716 |     /* Align the destination. */
1717 | .if \granularity == 1
1718 |     tst     r0, #1
1719 | .if \preload_early == 1 && \line_size == 32
1720 |     pld     [r3, #32]
1721 | .endif
1722 |     ldrbne  r3, [r1], #1
1723 |     subne   r2, r2, #1
1724 |     strbne  r3, [r0], #1
1725 | .endif
1726 | 
1727 |     tst     r0, #2
1728 | .if \granularity == 2 && \preload_early == 1 && \line_size == 32
1729 |     pld     [r3, #32]
1730 | .endif
1731 |     ldrbne  r3, [r1], #1
1732 |     ldrbne  ip, [r1], #1
1733 |     subne   r2, r2, #2
1734 |     orrne   r3, r3, ip, asl #8
1735 |     strhne  r3, [r0], #2
1736 |     b       3b
1737 | .endif
1738 | .endif
1739 | 
1740 |     /* Aligning this branch target to a 16-byte boundary helps performance a bit. */
1741 | .p2align 4
1742 | 1:
1743 |     /* Check that both destination and source are word aligned. */
1744 | .if \granularity <= 2
1745 |     tst     r0, #3
1746 | .endif
1747 |     push    {r0}
1748 | .if \granularity == 1
1749 |     tsteq   r1, #3
1750 | .endif
1751 | .if \preload_early == 1
1752 |     pld     [r3, #\line_size]
1753 | .endif
1754 |     push    {r3}
1755 | .if \granularity <= 2
1756 |     bne     3f
1757 | .endif
1758 | 
1759 |     /* Larger sizes with word aligned source and destination. */
1760 | 2:
1761 | .if \custom_write_align == 1
1762 |     MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN_CUSTOM
1763 | .else
1764 |     MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN \write_align
1765 | .endif
1766 |     /*
1767 |      * We don't need a check if the number of bytes left is guaranteed to
1768 |      * be >= line_size.
1769 |      */
1770 | .if (68 - (\write_align - 1)) >= \line_size
1771 |     pop     {r3}
1772 | .if \write_align == \line_size && 0
1773 |     mov     ip, #\preload_offset
1774 | .if \preload_early == 1
1775 |     pld     [r3, #(\line_size * 2)]
1776 | .endif
1777 |     push    {r4-r11}
1778 | .else
1779 |     add     ip, r1, #\preload_offset
1780 | .if \preload_early == 1
1781 |     pld     [r3, #(\line_size * 2)]
1782 | .endif
1783 |     push    {r4-r11}
1784 |     bic     ip, ip, #(\line_size - 1)
1785 |     sub     ip, ip, r1
1786 | .endif
1787 | .else
1788 |     cmp     r2, #\line_size
1789 | .if \write_align == \line_size && 0
1790 |     pop     {r3}
1791 |     mov     ip, #\preload_offset
1792 | .if \preload_early == 1
1793 |     pld     [r3, #(\line_size * 2)]
1794 | .endif
1795 |     pushge  {r4-r11}
1796 |     blt     6f
1797 | .else
1798 |     pop     {r3}
1799 |     addge   ip, r1, #\preload_offset
1800 | .if \preload_early == 1
1801 |     pld     [r3, #(\line_size * 2)]
1802 | .endif
1803 |     pushge  {r4-r11}
1804 |     blt     6f
1805 |     bic     ip, ip, #(\line_size - 1)
1806 |     sub     ip, ip, r1
1807 | .endif
1808 | .endif
1809 | 
1810 | .if \line_size == 32
1811 | .if \preload_early == 1 && \preload_offset >= 96
1812 | .if \preload_catch_up == 1
1813 |     add     r4, r1, ip
1814 |     add     r3, r3, #(\line_size * 3)
1815 |     cmp     r3, r4
1816 |     addlt   r3, r3, #\line_size
1817 |     bge     12f
1818 | 11:
1819 |     cmp     r3, r4
1820 |     pld     [r3, #-\line_size]
1821 |     add     r3, r3, #\line_size
1822 |     blt     11b
1823 | 12:
1824 | .else
1825 |     pld     [r3, #(\line_size * 3)]
1826 | .endif
1827 | .endif
1828 |     sub     r2, r2, #32
1829 | 5:
1830 |     /*
1831 |      * The main loop for large sizes. Copy 32 bytes at a time
1832 |      * using ldmia/stmia while prefetching a 32-byte aligned
1833 |      * address.
1834 |      */
1835 |     pld     [r1, ip]
1836 | .if \block_write_size == 32
1837 |     ldmia   r1!, {r4-r11}
1838 |     subs    r2, r2, #32
1839 |     stmia   r0!, {r4-r11}
1840 | .else
1841 |     ldmia   r1!, {r4-r7}
1842 |     subs    r2, r2, #32
1843 |     ldmia   r1!, {r8-r11}
1844 |     stmia   r0!, {r4-r7}
1845 |     stmia   r0!, {r8-r11}
1846 | .endif
1847 |     bge     5b
1848 |     adds    r2, r2, #32
1849 |     pop     {r4-r11}
1850 |     popeq   {r0}
1851 |     bxeq    lr
1852 | .endif
1853 | 
1854 | .if \line_size == 64
1855 | .if \preload_early == 1 && \preload_offset >= 128
1856 | .if \preload_catch_up == 1
1857 |     add     r4, r1, ip
1858 |     add     r3, r3, #(\line_size * 3)
1859 |     cmp     r3, r4
1860 |     addlt   r3, r3, #\line_size
1861 |     bge     12f
1862 | 11:
1863 |     cmp     r3, r4
1864 |     pld     [r3, #-\line_size]
1865 |     add     r3, r3, #\line_size
1866 |     blt     11b
1867 | 12:
1868 | .else
1869 |     pld     [r3, #(\line_size * 3)]
1870 | .endif
1871 | .endif
1872 |     sub     r2, r2, #64
1873 |     /* Aligning the main loop branch target seems to help performance a bit. */
1874 |     b       5f
1875 | .p2align 4
1876 | 5:
1877 |     /*
1878 |      * The main loop for large sizes. Copy 64 bytes at a time
1879 |      * using ldmia/stmia while prefetching a 64-byte aligned
1880 |      * address.
1881 |      */
1882 |     pld     [r1, ip]
1883 |     ldmia   r1!, {r4-r11}
1884 |     subs    r2, r2, #64
1885 |     stmia   r0!, {r4-r11}
1886 |     ldmia   r1!, {r4-r11}
1887 |     stmia   r0!, {r4-r11}
1888 |     bge     5b
1889 |     adds    r2, r2, #64
1890 |     pop     {r4-r11}
1891 |     popeq   {r0}
1892 |     bxeq    lr
1893 | .endif
1894 | 
1895 | 6:
1896 | .if \line_size == 64
1897 |     cmp     r2, #32
1898 |     ldmiage r1!, {r3, ip}
1899 |     blt     10f
1900 |     stmia   r0!, {r3, ip}
1901 |     ldmia   r1!, {r3, ip}
1902 |     stmia   r0!, {r3, ip}
1903 |     ldmia   r1!, {r3, ip}
1904 |     stmia   r0!, {r3, ip}
1905 |     ldmia   r1!, {r3, ip}
1906 |     sub     r2, r2, #32
1907 |     stmia   r0!, {r3, ip}
1908 |     popeq   {r0}
1909 |     bxeq    lr
1910 | 10:
1911 | .endif
1912 | 
1913 |     cmp     r2, #16
1914 |     ldrge   r3, [r1]
1915 |     ldrge   ip, [r1, #4]
1916 |     blt     7f
1917 |     sub     r2, r2, #16
1918 |     str     r3, [r0]
1919 |     str     ip, [r0, #4]
1920 |     ldr     r3, [r1, #8]
1921 |     ldr     ip, [r1, #12]
1922 |     add     r1, r1, #16
1923 |     str     r3, [r0, #8]
1924 |     str     ip, [r0, #12]
1925 |     popeq   {r0}
1926 |     bxeq    lr
1927 |     add     r0, r0, #16
1928 | 7:
1929 |     cmp     r2, #8
1930 |     ldrge   ip, [r1]
1931 |     ldrge   r3, [r1, #4]
1932 |     strge   ip, [r0], #4
1933 |     pop     {ip}
1934 |     strge   r3, [r0], #4
1935 |     moveq   r0, ip
1936 |     bxeq    lr
1937 |     addge   r1, r1, #8
1938 | 
1939 |     tst     r2, #4
1940 |     ldrne   r3, [r1], #4
1941 |     strne   r3, [r0], #4
1942 |     tst     r2, #3
1943 |     moveq   r0, ip
1944 |     bxeq    lr
1945 | .if \granularity <= 2
1946 |     tst     r2, #2
1947 |     ldrhne  r3, [r1], #2
1948 |     strhne  r3, [r0], #2
1949 | .endif
1950 | .if \granularity == 1
1951 |     tst     r2, #1
1952 |     ldrbne  r3, [r1]
1953 |     strbne  r3, [r0]
1954 | .endif
1955 |     mov     r0, ip
1956 |     bx      lr
1957 | 
1958 | .if \granularity <= 2
1959 | 3:
1960 |     /*
1961 |      * Copy data until destination address is 4 bytes aligned.
1962 |      */
1963 | .if \granularity == 1
1964 |     tst     r0, #1
1965 |     ldrbne  r3, [r1], #1
1966 |     subne   r2, r2, #1
1967 |     strbne  r3, [r0], #1
1968 | .endif
1969 | 
1970 |     tst     r0, #2
1971 |     ldrbne  r3, [r1], #1
1972 |     ldrbne  ip, [r1], #1
1973 |     subne   r2, r2, #2
1974 |     orrne   r3, r3, ip, asl #8
1975 |     strhne  r3, [r0], #2
1976 |     /* destination address is 4 bytes aligned */
1977 | 
1978 |     tst     r1, #3
1979 |     popne   {r3}
1980 |     beq     2b
1981 | 
1982 |     /* Unaligned copy. */
1983 | .if \granularity == 1
1984 |     tst     r1, #1
1985 | .endif
1986 | .if \preload_early == 1
1987 |     pld     [r3, #(\line_size * 2)]
1988 | .endif
1989 | .if \granularity == 1
1990 |     bne     2f
1991 | .endif
1992 | 
1993 |     MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART 2, line_size, write_align, block_write_size, preload_offset, custom_write_align
1994 | 4:
1995 |     tst     r2, #2
1996 |     ldrbne  r3, [r1], #1
1997 |     ldrbne  ip, [r1], #1
1998 |     strbne  r3, [r0], #1
1999 |     strbne  ip, [r0], #1
2000 | 
2001 | .if \granularity == 1
2002 |     tst     r2, #1
2003 |     mov     ip, r0
2004 |     ldrbne  r3, [r1]
2005 |     ldr     r0, [sp], #4
2006 |     strbne  r3, [ip]
2007 | .else
2008 |     pop     {r0}
2009 | .endif
2010 |     bx      lr
2011 | 
2012 | .if \granularity == 1
2013 | 3:
2014 |     MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART 3, line_size, write_align, block_write_size, preload_offset, custom_write_align
2015 |     b       4b
2016 | 
2017 | 2:
2018 |     tst     r1, #2
2019 |     bne     3b
2020 | 
2021 |     MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART 1, line_size, write_align, block_write_size, preload_offset, custom_write_align
2022 |     b       4b
2023 | .endif
2024 | 
2025 | .endif
2026 | 
2027 | .endm
2028 | 
2029 | /*
2030 |  * Settings for the MEMCPY_VARIANT_SIMPLE macro
2031 |  *
2032 |  * granularity
2033 |  *     Must be 1, 2 or 4. This value is 1 for normal memcpy, 2 for operations on half-word
2034 |  *     aligned regions such as 16bpp framebuffers/images, and 4 for operations on word aligned
2035 |  *     regions such as 32bpp framebuffers\images.
2036 |  * line_size
2037 |  *     Must be 32 or 64. Defines the cache line size used for preloads. Preloads are only done
2038 |  *     at line_size aligned addresses. When early preload is enabled, the current implementation
2039 |  *     results in more aggressive early preload in the case of a line size of 64.
2040 |  * write_align
2041 |  *     Must be 16, 32, or 64. Defines the write alignment that is applied just before the main loop
2042 |  *     for larger sizes. The main loop processes chunks of line_size bytes at a time.
2043 |  * block_write_size
2044 |  *     Must must be 32, 16. Defines the size of multiple-register load and store instructions that
2045 |  *     are used in the main loop for larger sizes.
2046 |  * preload_offset
2047 |  *     Must be a multiple of line_size. Defines the offset from the current source address at which
2048 |  *     preloads are performed (look-ahead) in the main loop. The real applied offset is derived before
2049 |  *     the start of the main loop by adding the preload offset to the source address and rounding
2050 |  *     down the result to a line_size boundary, and then substracting the source address.
2051 |  * preload_catch_up
2052 |  *     Must be 0 or 1. When early preload is enabled, this enables code just before the main loop
2053 |  *     that performs a series of preloads from just beyond the last early preload to just before
2054 |  *     the first preload in the main loop, filling in the gap.
2055 |  * preload_early
2056 |  *     Must be 0 or 1. When enabled, preload instructions are enabled early in the memcpy function
2057 |  *     to preload the initial part of the source memory region. Early preloads start at the source
2058 |  *     address aligned to a line_size boundary and end at that address + line_size * 2 (three
2059 |  *     early preloads in total).
2060 |  * overfetch
2061 |  *     Must be 1.
2062 |  * custom_write_align
2063 |  *     Must be 0 or 1. Enables RPi-specific write alignment whereby 32-byte alignment is only applied
2064 |  *     if the source address will be located after alignment in the second half of a 32-byte aligned
2065 |  *     chunk; if not, write alignment remains at 16 bytes.
2066 |  * check_small_size_alignment
2067 |  *     Must be 0 or 1. For small sizes less than 68 bytes, unaligned memory access is used to reduce
2068 |  *     overhead in improve performance. However, when both source and destination are unaligned
2069 |  *     this induce a performance penalty. When this option is enabled, beyond a certain size threshold
2070 |  *     (currently set at 32 bytes), the destination is aligned to a word boundary. This may speed up
2071 |  *     unaligned copies in the range of 33 to 67 bytes.
2072 |  *
2073 |  * Restrictions:
2074 |  *     If line_size is 64, write_align must be 32 or 64, block_write_size must be 32, preload_offset
2075 |  *     must be a multiple of 64.
2076 |  *     If preload_catch_up is 1 then preload_early must be 1.
2077 |  */
2078 | 
2079 | 
2080 | #if defined(MEMCPY_REPLACEMENT_SUNXI) || defined(MEMCPY_REPLACEMENT_RPI)
2081 | 
2082 | #ifdef MEMCPY_REPLACEMENT_SUNXI
2083 | 
2084 | /* memcpy replacement for the Allwinner platform. */
2085 | 
2086 | asm_function memcpy
2087 |     MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 0, 1, 1, 0, 0
2088 | .endfunc
2089 | 
2090 | #endif
2091 | 
2092 | #ifdef MEMCPY_REPLACEMENT_RPI
2093 | 
2094 | /* memcpy replacement for the RPi platform. */
2095 | 
2096 | asm_function memcpy
2097 |     MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 128, 1, 1, 1, 1, 1
2098 | .endfunc
2099 | 
2100 | #endif
2101 | 
2102 | #else
2103 | 
2104 | #ifdef RPI_BEST_MEMCPY_ONLY
2105 | 
2106 | /* Optimized memcpy variants for the RPi platform . */
2107 | 
2108 | asm_function memcpy_armv5te_no_overfetch
2109 |     MEMCPY_VARIANT 1, 32, 16, 16, 96, 1, 0
2110 | .endfunc
2111 | 
2112 | asm_function memcpy_armv5te_overfetch
2113 |     MEMCPY_VARIANT 1, 32, 16, 16, 128, 1, 1
2114 | .endfunc
2115 | 
2116 | asm_function memcpy_halfwords_armv5te_no_overfetch
2117 |     MEMCPY_VARIANT 2, 32, 16, 16, 96, 1, 0
2118 | .endfunc
2119 | 
2120 | asm_function memcpy_halfwords_armv5te_overfetch
2121 |     MEMCPY_VARIANT 2, 32, 16, 16, 128, 1, 1
2122 | .endfunc
2123 | 
2124 | asm_function memcpy_words_armv5te_no_overfetch
2125 |     MEMCPY_VARIANT 4, 32, 16, 16, 96, 1, 0
2126 | .endfunc
2127 | 
2128 | asm_function memcpy_words_armv5te_overfetch
2129 |     MEMCPY_VARIANT 4, 32, 16, 16, 128, 1, 1
2130 | .endfunc
2131 | 
2132 | #else
2133 | 
2134 | /* A large set of memcpy variants, used in the benchmark program */
2135 | 
2136 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_8_preload_96
2137 |     MEMCPY_VARIANT 1, 32, 16, 8, 96, 0, 0
2138 | .endfunc
2139 | 
2140 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_96
2141 |     MEMCPY_VARIANT 1, 32, 16, 16, 96, 0, 0
2142 | .endfunc
2143 | 
2144 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_96
2145 |     MEMCPY_VARIANT 1, 32, 16, 16, 96, 1, 0
2146 | .endfunc
2147 | 
2148 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_128
2149 |     MEMCPY_VARIANT 1, 32, 16, 16, 128, 1, 0
2150 | .endfunc
2151 | 
2152 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_8_preload_96
2153 |     MEMCPY_VARIANT 1, 32, 32, 8, 96, 0, 0
2154 | .endfunc
2155 | 
2156 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_64
2157 |     MEMCPY_VARIANT 1, 32, 32, 16, 64, 0, 0
2158 | .endfunc
2159 | 
2160 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_96
2161 |     MEMCPY_VARIANT 1, 32, 32, 16, 96, 0, 0
2162 | .endfunc
2163 | 
2164 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_128
2165 |     MEMCPY_VARIANT 1, 32, 32, 16, 128, 0, 0
2166 | .endfunc
2167 | 
2168 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_160
2169 |     MEMCPY_VARIANT 1, 32, 32, 16, 160, 0, 0
2170 | .endfunc
2171 | 
2172 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_192
2173 |     MEMCPY_VARIANT 1, 32, 32, 16, 192, 0, 0
2174 | .endfunc
2175 | 
2176 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_256
2177 |     MEMCPY_VARIANT 1, 32, 32, 16, 256, 0, 0
2178 | .endfunc
2179 | 
2180 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_64
2181 |     MEMCPY_VARIANT 1, 32, 32, 32, 64, 0, 0
2182 | .endfunc
2183 | 
2184 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_96
2185 |     MEMCPY_VARIANT 1, 32, 32, 32, 96, 0, 0
2186 | .endfunc
2187 | 
2188 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_128
2189 |     MEMCPY_VARIANT 1, 32, 32, 32, 128, 0, 0
2190 | .endfunc
2191 | 
2192 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_160
2193 |     MEMCPY_VARIANT 1, 32, 32, 32, 160, 0, 0
2194 | .endfunc
2195 | 
2196 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_192
2197 |     MEMCPY_VARIANT 1, 32, 32, 32, 192, 0, 0
2198 | .endfunc
2199 | 
2200 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_256
2201 |     MEMCPY_VARIANT 1, 32, 32, 32, 256, 0, 0
2202 | .endfunc
2203 | 
2204 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_96
2205 |     MEMCPY_VARIANT 1, 32, 32, 16, 96, 1, 0
2206 | .endfunc
2207 | 
2208 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_128
2209 |     MEMCPY_VARIANT 1, 32, 32, 16, 128, 1, 0
2210 | .endfunc
2211 | 
2212 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_192
2213 |     MEMCPY_VARIANT 1, 32, 32, 16, 192, 1, 0
2214 | .endfunc
2215 | 
2216 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_256
2217 |     MEMCPY_VARIANT 1, 32, 32, 16, 256, 1, 0
2218 | .endfunc
2219 | 
2220 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_128
2221 |     MEMCPY_VARIANT 1, 32, 32, 32, 128, 1, 0
2222 | .endfunc
2223 | 
2224 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_192
2225 |     MEMCPY_VARIANT 1, 32, 32, 32, 192, 1, 0
2226 | .endfunc
2227 | 
2228 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_256
2229 |     MEMCPY_VARIANT 1, 32, 32, 32, 256, 1, 0
2230 | .endfunc
2231 | 
2232 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_no_preload
2233 |     MEMCPY_VARIANT 1, 32, 32, 16, 0, 0, 0
2234 | .endfunc
2235 | 
2236 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_no_preload
2237 |     MEMCPY_VARIANT 1, 32, 32, 32, 0, 0, 0
2238 | .endfunc
2239 | 
2240 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_128
2241 |     MEMCPY_VARIANT 1, 64, 32, 32, 128, 1, 0
2242 | .endfunc
2243 | 
2244 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192
2245 |     MEMCPY_VARIANT 1, 64, 32, 32, 192, 1, 0
2246 | .endfunc
2247 | 
2248 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_256
2249 |     MEMCPY_VARIANT 1, 64, 32, 32, 256, 1, 0
2250 | .endfunc
2251 | 
2252 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_320
2253 |     MEMCPY_VARIANT 1, 64, 32, 32, 320, 1, 0
2254 | .endfunc
2255 | 
2256 | asm_function memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_192
2257 |     MEMCPY_VARIANT 1, 64, 64, 32, 192, 1, 0
2258 | .endfunc
2259 | 
2260 | asm_function memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_256
2261 |     MEMCPY_VARIANT 1, 64, 64, 32, 256, 1, 0
2262 | .endfunc
2263 | 
2264 | asm_function memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_320
2265 |     MEMCPY_VARIANT 1, 64, 64, 32, 320, 1, 0
2266 | .endfunc
2267 | 
2268 | /* Overfetching versions. */
2269 | 
2270 | asm_function memcpy_armv5te_overfetch_align_16_block_write_16_preload_early_128
2271 |     MEMCPY_VARIANT 1, 32, 16, 16, 128, 1, 1
2272 | .endfunc
2273 | 
2274 | asm_function memcpy_armv5te_overfetch_align_32_block_write_32_preload_early_192
2275 |     MEMCPY_VARIANT 1, 32, 32, 32, 192, 1, 1
2276 | .endfunc
2277 | 
2278 | asm_function memcpy_simple_sunxi_preload_early_192
2279 |     MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 1, 1, 1, 0, 0
2280 | .endfunc
2281 | 
2282 | asm_function memcpy_simple_sunxi_preload_early_192_no_catch_up
2283 |     MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 0, 1, 1, 0, 0
2284 | .endfunc
2285 | 
2286 | asm_function memcpy_simple_sunxi_preload_early_192_no_catch_up_check_small_size_alignment
2287 |     MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 0, 1, 1, 0, 1
2288 | .endfunc
2289 | 
2290 | asm_function memcpy_simple_sunxi_preload_early_256
2291 |     MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 256, 1, 1, 1, 0, 0
2292 | .endfunc
2293 | 
2294 | asm_function memcpy_simple_sunxi_preload_early_256_no_catch_up
2295 |     MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 256, 0, 1, 1, 0, 0
2296 | .endfunc
2297 | 
2298 | asm_function memcpy_simple_rpi_preload_early_96
2299 |     MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 96, 1, 1, 1, 1, 1
2300 | .endfunc
2301 | 
2302 | asm_function memcpy_simple_rpi_preload_early_96_no_catch_up
2303 |     MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 96, 0, 1, 1, 1, 0
2304 | .endfunc
2305 | 
2306 | asm_function memcpy_simple_rpi_preload_early_96_no_catch_up_check_small_size_alignment
2307 |     MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 96, 0, 1, 1, 1, 1
2308 | .endfunc
2309 | 
2310 | asm_function memcpy_simple_rpi_preload_early_128
2311 |     MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 128, 1, 1, 1, 1, 1
2312 | .endfunc
2313 | 
2314 | asm_function memcpy_simple_rpi_preload_early_128_no_catch_up
2315 |     MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 128, 0, 1, 1, 1, 1
2316 | .endfunc
2317 | 
2318 | #endif
2319 | 
2320 | #endif
2321 | 
2322 | #endif
2323 | 


--------------------------------------------------------------------------------
/arm_asm.h:
--------------------------------------------------------------------------------
  1 | 
  2 | extern void *memcpy_armv5te(void *dest, const void *src, size_t n);
  3 | 
  4 | #ifdef RPI_BEST_MEMCPY_ONLY
  5 | 
  6 | extern void *memcpy_armv5te_no_overfetch(void *dest, const void *src, size_t n);
  7 | 
  8 | extern void *memcpy_armv5te_overfetch(void *dest, const void *src, size_t n);
  9 | 
 10 | extern void *memcpy_halfwords_armv5te_no_overfetch(void *dest, const void *src, size_t n);
 11 | 
 12 | extern void *memcpy_halfwords_armv5te_overfetch(void *dest, const void *src, size_t n);
 13 | 
 14 | extern void *memcpy_words_armv5te_no_overfetch(void *dest, const void *src, size_t n);
 15 | 
 16 | extern void *memcpy_words_armv5te_overfetch(void *dest, const void *src, size_t n);
 17 | 
 18 | #else
 19 | 
 20 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_8_preload_96(void *dest,
 21 |     const void *src, size_t n);
 22 | 
 23 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_96(void *dest,
 24 |     const void *src, size_t n);
 25 | 
 26 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_96(void *dest,
 27 |     const void *src, size_t n);
 28 | 
 29 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_128(void *dest,
 30 |     const void *src, size_t n);
 31 | 
 32 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_8_preload_96(void *dest,
 33 |     const void *src, size_t n);
 34 | 
 35 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_64(void *dest,
 36 |     const void *src, size_t n);
 37 | 
 38 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_96(void *dest,
 39 |     const void *src, size_t n);
 40 | 
 41 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_128(void *dest,
 42 |     const void *src, size_t n);
 43 | 
 44 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_160(void *dest,
 45 |     const void *src, size_t n);
 46 | 
 47 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_192(void *dest,
 48 |     const void *src, size_t n);
 49 | 
 50 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_256(void *dest,
 51 |     const void *src, size_t n);
 52 | 
 53 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_64(void *dest,
 54 |     const void *src, size_t n);
 55 | 
 56 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_96(void *dest,
 57 |     const void *src, size_t n);
 58 | 
 59 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_128(void *dest,
 60 |     const void *src, size_t n);
 61 | 
 62 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_160(void *dest,
 63 |     const void *src, size_t n);
 64 | 
 65 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_192(void *dest,
 66 |     const void *src, size_t n);
 67 | 
 68 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_256(void *dest,
 69 |     const void *src, size_t n);
 70 | 
 71 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_96(void *dest,
 72 |     const void *src, size_t n);
 73 | 
 74 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_128(void *dest,
 75 |     const void *src, size_t n);
 76 | 
 77 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_192(void *dest,
 78 |     const void *src, size_t n);
 79 | 
 80 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_256(void *dest,
 81 |     const void *src, size_t n);
 82 | 
 83 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_128(void *dest,
 84 |     const void *src, size_t n);
 85 | 
 86 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_192(void *dest,
 87 |     const void *src, size_t n);
 88 | 
 89 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_256(void *dest,
 90 |     const void *src, size_t n);
 91 | 
 92 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_no_preload(void *dest,
 93 |     const void *src, size_t n);
 94 | 
 95 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_no_preload(void *dest,
 96 |     const void *src, size_t n);
 97 | 
 98 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_128(void *dest,
 99 |     const void *src, size_t n);
100 | 
101 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192(void *dest,
102 |     const void *src, size_t n);
103 | 
104 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_256(void *dest,
105 |     const void *src, size_t n);
106 | 
107 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_320(void *dest,
108 |     const void *src, size_t n);
109 | 
110 | extern void *memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_192(void *dest,
111 |     const void *src, size_t n);
112 | 
113 | extern void *memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_256(void *dest,
114 |     const void *src, size_t n);
115 | 
116 | extern void *memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_320(void *dest,
117 |     const void *src, size_t n);
118 | 
119 | extern void *memcpy_armv5te_overfetch_align_16_block_write_16_preload_early_128(void *dest,
120 |     const void *src, size_t n);
121 | 
122 | extern void *memcpy_armv5te_overfetch_align_32_block_write_32_preload_early_192(void *dest,
123 |     const void *src, size_t n);
124 | 
125 | extern void *memcpy_simple_sunxi_preload_early_192(void *dest, const void *src, size_t n);
126 | extern void *memcpy_simple_sunxi_preload_early_192_no_catch_up(void *dest, const void *src, size_t n);
127 | extern void *memcpy_simple_sunxi_preload_early_192_no_catch_up_check_small_size_alignment(void *dest, const void *src, size_t n);
128 | extern void *memcpy_simple_sunxi_preload_early_256(void *dest, const void *src, size_t n);
129 | extern void *memcpy_simple_sunxi_preload_early_256_no_catch_up(void *dest, const void *src, size_t n);
130 | 
131 | extern void *memcpy_simple_rpi_preload_early_96(void *dest, const void *src, size_t n);
132 | extern void *memcpy_simple_rpi_preload_early_96_no_catch_up(void *dest, const void *src, size_t n);
133 | extern void *memcpy_simple_rpi_preload_early_96_no_catch_up_check_small_size_alignment(void *dest, const void *src, size_t n);
134 | extern void *memcpy_simple_rpi_preload_early_128(void *dest, const void *src, size_t n);
135 | extern void *memcpy_simple_rpi_preload_early_128_no_catch_up(void *dest, const void *src, size_t n);
136 | 
137 | #endif
138 | 


--------------------------------------------------------------------------------
/benchmark.c:
--------------------------------------------------------------------------------
   1 | /*
   2 |  * Copyright (C) 2013 Harm Hanemaaijer <fgenfb@yahoo.com>
   3 |  *
   4 |  * Permission is hereby granted, free of charge, to any person obtaining a
   5 |  * copy of this software and associated documentation files (the "Software"),
   6 |  * to deal in the Software without restriction, including without limitation
   7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 |  * and/or sell copies of the Software, and to permit persons to whom the
   9 |  * Software is furnished to do so, subject to the following conditions:
  10 |  *
  11 |  * The above copyright notice and this permission notice (including the next
  12 |  * paragraph) shall be included in all copies or substantial portions of the
  13 |  * Software.
  14 |  *
  15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21 |  * DEALINGS IN THE SOFTWARE.
  22 |  *
  23 |  */
  24 | 
  25 | #include <stdlib.h>
  26 | #include <stdio.h>
  27 | #include <string.h>
  28 | #include <stdint.h>
  29 | #include <unistd.h>
  30 | #include <ctype.h>
  31 | #include <time.h>
  32 | #include <sys/time.h>
  33 | #include <math.h>
  34 | 
  35 | #include "arm_asm.h"
  36 | #include "new_arm.h"
  37 | #ifdef INCLUDE_MEMCPY_HYBRID
  38 | #include "memcpy-hybrid.h"
  39 | #endif
  40 | 
  41 | #define DEFAULT_TEST_DURATION 2.0
  42 | #define RANDOM_BUFFER_SIZE 256
  43 | 
  44 | #ifdef INCLUDE_LIBARMMEM_MEMCPY
  45 | 
  46 | void *armmem_memcpy(void * restrict s1, const void * restrict s2, size_t n);
  47 | 
  48 | #define LIBARMMEM_COUNT 1
  49 | #else
  50 | #define LIBARMMEM_COUNT 0
  51 | #endif
  52 | 
  53 | #ifdef INCLUDE_MEMCPY_HYBRID
  54 | #define MEMCPY_HYBRID_COUNT 1
  55 | #else
  56 | #define MEMCPY_HYBRID_COUNT 0
  57 | #endif
  58 | 
  59 | #define NU_MEMCPY_VARIANTS (57 + LIBARMMEM_COUNT + MEMCPY_HYBRID_COUNT)
  60 | #define NU_MEMSET_VARIANTS 5
  61 | 
  62 | 
  63 | typedef void *(*memcpy_func_type)(void *dest, const void *src, size_t n);
  64 | typedef void *(*memset_func_type)(void *dest, int c, size_t n);
  65 | 
  66 | memcpy_func_type memcpy_func;
  67 | memset_func_type memset_func;
  68 | uint8_t *buffer_alloc, *buffer_chunk, *buffer_page, *buffer_compare;
  69 | int *random_buffer_1024, *random_buffer_1M, *random_buffer_powers_of_two_up_to_4096_power_law;
  70 | int *random_buffer_multiples_of_four_up_to_1024_power_law, *random_buffer_up_to_1023_power_law;
  71 | double test_duration = DEFAULT_TEST_DURATION;
  72 | int memcpy_mask[NU_MEMCPY_VARIANTS];
  73 | int memset_mask[NU_MEMSET_VARIANTS];
  74 | int test_alignment;
  75 | 
  76 | static const char *memcpy_variant_name[NU_MEMCPY_VARIANTS] = {
  77 |     "standard memcpy",
  78 | #ifdef INCLUDE_LIBARMMEM_MEMCPY
  79 |     "libarmmem memcpy",
  80 | #endif
  81 | #ifdef INCLUDE_MEMCPY_HYBRID
  82 |     "cortex-strings memcpy-hybrid (NEON)",
  83 | #endif
  84 |     "armv5te memcpy",
  85 |     "new memcpy for cortex with line size of 32, preload offset of 192",
  86 |     "new memcpy for cortex with line size of 64, preload offset of 192",
  87 |     "new memcpy for cortex using NEON with line size 32, preload offset 192",
  88 |     "new memcpy for cortex using NEON with line size 64, preload offset 192",
  89 |     "new memcpy for cortex using NEON with line size 32, only early preload (relying on automatic prefetcher)",
  90 |     "new memcpy for sunxi with line size of 64, preload offset of 192 and write alignment of 32",
  91 |     "new memcpy for sunxi with line size of 64, preload offset of 192 and aligned access",
  92 |     "new memcpy for sunxi with line size of 32, preload offset of 192 and write alignment of 32",
  93 |     "new memcpy for rpi with preload offset of 96, write alignment of 8",
  94 |     "new memcpy for rpi with preload offset of 96, write alignment of 8 and aligned access",
  95 |     "simplified memcpy for sunxi with preload offset of 192, early preload and preload catch up",
  96 |     "simplified memcpy for sunxi with preload offset of 192, early preload and no preload catch up",
  97 |     "simplified memcpy for sunxi with preload offset of 192, early preload, no preload catch up and with small size alignment check",
  98 |     "simplified memcpy for sunxi with preload offset of 256, early preload and preload catch up",
  99 |     "simplified memcpy for sunxi with preload offset of 256, early preload and no preload catch up",
 100 |     "simplified memcpy for rpi with preload offset of 96, early preload and preload catch up",
 101 |     "simplified memcpy for rpi with preload offset of 96, early preload and no preload catch up",
 102 |     "simplified memcpy for rpi with preload offset of 96, early preload and no preload catch up and with small size alignment check",
 103 |     "simplified memcpy for rpi with preload offset of 128, early preload and preload catch up",
 104 |     "simplified memcpy for rpi with preload offset of 128, early preload and no preload catch up",
 105 |     "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 8, preload offset 96",
 106 |     "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 96",
 107 |     "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 96 with early preload",
 108 |     "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 128 with early preload",
 109 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 8, preload offset 96",
 110 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 64",
 111 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 96",
 112 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 128",
 113 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 160",
 114 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 192",
 115 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 256",
 116 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 64",
 117 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 96",
 118 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 128",
 119 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 160",
 120 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 192",
 121 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 256",
 122 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 96 with early preload",
 123 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 128 with early preload",
 124 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 192 with early preload",
 125 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 256 with early preload",
 126 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 128 with early preload",
 127 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 192 with early preload",
 128 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 256 with early preload",
 129 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, no preload",
 130 |     "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, no preload",
 131 |     "armv5te non-overfetching memcpy with line size of 64, write alignment of 32 and block write size of 32, preload offset 128 with early preload",
 132 |     "armv5te non-overfetching memcpy with line size of 64, write alignment of 32 and block write size of 32, preload offset 192 with early preload",
 133 |     "armv5te non-overfetching memcpy with line size of 64, write alignment of 32 and block write size of 32, preload offset 256 with early preload",
 134 |     "armv5te non-overfetching memcpy with line_size of 64, write alignment of 32 and block write size of 32, preload offset 320 with early preload",
 135 |     "armv5te non-overfetching memcpy with line size of 64, write alignment of 64 and block write size of 32, preload offset 192 with early preload",
 136 |     "armv5te non-overfetching memcpy with line size of 64, write alignment of 64 and block write size of 32, preload offset 256 with early preload",
 137 |     "armv5te non-overfetching memcpy with line_size of 64, write alignment of 64 and block write size of 32, preload offset 320 with early preload",
 138 |     "armv5te overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 128 with early preload",
 139 |     "armv5te overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 192 with early preload"
 140 | };
 141 | 
 142 | static const memcpy_func_type memcpy_variant[NU_MEMCPY_VARIANTS] = {
 143 |     memcpy,
 144 | #ifdef INCLUDE_LIBARMMEM_MEMCPY
 145 |     armmem_memcpy,
 146 | #endif
 147 | #ifdef INCLUDE_MEMCPY_HYBRID
 148 |     memcpy_hybrid,
 149 | #endif
 150 |     memcpy_armv5te,
 151 |     memcpy_new_line_size_32_preload_192,
 152 |     memcpy_new_line_size_64_preload_192,
 153 |     memcpy_new_neon_line_size_32,
 154 |     memcpy_new_neon_line_size_64,
 155 |     memcpy_new_neon_line_size_32_auto,
 156 |     memcpy_new_line_size_64_preload_192_align_32,
 157 |     memcpy_new_line_size_64_preload_192_aligned_access,
 158 |     memcpy_new_line_size_32_preload_192_align_32,
 159 |     memcpy_new_line_size_32_preload_96,
 160 |     memcpy_new_line_size_32_preload_96_aligned_access,
 161 |     memcpy_simple_sunxi_preload_early_192,
 162 |     memcpy_simple_sunxi_preload_early_192_no_catch_up,
 163 |     memcpy_simple_sunxi_preload_early_192_no_catch_up_check_small_size_alignment,
 164 |     memcpy_simple_sunxi_preload_early_256,
 165 |     memcpy_simple_sunxi_preload_early_256_no_catch_up,
 166 |     memcpy_simple_rpi_preload_early_96,
 167 |     memcpy_simple_rpi_preload_early_96_no_catch_up,
 168 |     memcpy_simple_rpi_preload_early_96_no_catch_up_check_small_size_alignment,
 169 |     memcpy_simple_rpi_preload_early_128,
 170 |     memcpy_simple_rpi_preload_early_128_no_catch_up,
 171 |     memcpy_armv5te_no_overfetch_align_16_block_write_8_preload_96,
 172 |     memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_96,
 173 |     memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_96,
 174 |     memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_128,
 175 |     memcpy_armv5te_no_overfetch_align_32_block_write_8_preload_96,
 176 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_64,
 177 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_96,
 178 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_128,
 179 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_160,
 180 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_192,
 181 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_256,
 182 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_64,
 183 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_96,
 184 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_128,
 185 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_160,
 186 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_192,
 187 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_256,
 188 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_96,
 189 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_128,
 190 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_192,
 191 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_256,
 192 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_128,
 193 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_192,
 194 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_256,
 195 |     memcpy_armv5te_no_overfetch_align_32_block_write_16_no_preload,
 196 |     memcpy_armv5te_no_overfetch_align_32_block_write_32_no_preload,
 197 |     memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_128,
 198 |     memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192,
 199 |     memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_256,
 200 |     memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_320,
 201 |     memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_192,
 202 |     memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_256,
 203 |     memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_320,
 204 |     memcpy_armv5te_overfetch_align_16_block_write_16_preload_early_128,
 205 |     memcpy_armv5te_overfetch_align_32_block_write_32_preload_early_192
 206 | };
 207 | 
 208 | static const char *memset_variant_name[NU_MEMSET_VARIANTS] = {
 209 |     "libc memset",
 210 |     "optimized memset with write alignment of 0",
 211 |     "optimized memset with write alignment of 8",
 212 |     "optimized memset with write alignment of 32",
 213 |     "NEON memset",
 214 | };
 215 | 
 216 | static const memset_func_type memset_variant[NU_MEMSET_VARIANTS] = {
 217 |     memset,
 218 |     memset_new_align_0,
 219 |     memset_new_align_8,
 220 |     memset_new_align_32,
 221 |     memset_neon
 222 | };
 223 | 
 224 | static double get_time() {
 225 |    struct timespec ts;
 226 |    clock_gettime(CLOCK_REALTIME, &ts);
 227 |    return (double)ts.tv_sec + (double)ts.tv_nsec / 1000000000.0;
 228 | }
 229 | 
 230 | static void test_mixed_powers_of_two_word_aligned(int i) {
 231 |     memcpy_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 232 |         buffer_page + random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 233 |         random_buffer_powers_of_two_up_to_4096_power_law[i & (RANDOM_BUFFER_SIZE - 1)]);
 234 | }
 235 | 
 236 | static void test_mixed_power_law_word_aligned(int i) {
 237 |     memcpy_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 238 |         buffer_page + random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 239 |         random_buffer_multiples_of_four_up_to_1024_power_law[i & (RANDOM_BUFFER_SIZE - 1)]);
 240 | }
 241 | 
 242 | static void test_mixed_power_law_unaligned(int i) {
 243 |     memcpy_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 244 |         buffer_page + random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 245 |         random_buffer_up_to_1023_power_law[i & (RANDOM_BUFFER_SIZE - 1)]);
 246 | }
 247 | 
 248 | static void test_unaligned_random_3(int i) {
 249 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 250 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 251 |         3);
 252 | }
 253 | 
 254 | static void test_unaligned_random_8(int i) {
 255 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 256 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 257 |         8);
 258 | }
 259 | 
 260 | static void test_aligned_4(int i) {
 261 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 262 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 263 |         4);
 264 | }
 265 | 
 266 | static void test_aligned_8(int i) {
 267 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 268 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 269 |         8);
 270 | }
 271 | 
 272 | static void test_aligned_16(int i) {
 273 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 274 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 275 |         16);
 276 | }
 277 | 
 278 | static void test_aligned_32(int i) {
 279 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 280 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 281 |         32);
 282 | }
 283 | 
 284 | static void test_aligned_64(int i) {
 285 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 286 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 287 |         64);
 288 | }
 289 | 
 290 | static void test_aligned_128(int i) {
 291 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 292 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 293 |         128);
 294 | }
 295 | 
 296 | static void test_aligned_256(int i) {
 297 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 298 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 299 |         256);
 300 | }
 301 | 
 302 | static void test_unaligned_random_17(int i) {
 303 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 304 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 305 |         17);
 306 | }
 307 | 
 308 | static void test_unaligned_random_28(int i) {
 309 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 310 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 311 |         28);
 312 | }
 313 | 
 314 | static void test_aligned_28(int i) {
 315 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 316 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 317 |         28);
 318 | }
 319 | 
 320 | static void test_unaligned_random_64(int i) {
 321 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 322 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 323 |         64);
 324 | }
 325 | 
 326 | static void test_unaligned_random_137(int i) {
 327 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 328 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 329 |         137);
 330 | }
 331 | 
 332 | static void test_unaligned_random_1024(int i) {
 333 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 334 |         buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 335 |         1024);
 336 | }
 337 | 
 338 | static void test_unaligned_random_32768(int i) {
 339 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 340 |         buffer_page + 65536 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 341 |         32768);
 342 | }
 343 | 
 344 | static void test_unaligned_random_1M(int i) {
 345 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 346 |         buffer_page + 2 * 1024 * 1024 +
 347 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 348 |         1024 * 1024);
 349 | }
 350 | 
 351 | static void test_source_dest_aligned_random_64(int i) {
 352 |     memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 353 |         buffer_page + 4096 + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 354 |         64);
 355 | }
 356 | 
 357 | static void test_source_dest_aligned_random_1024(int i) {
 358 |     memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 359 |         buffer_page + 4096 + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 360 |         1024);
 361 | }
 362 | 
 363 | static void test_source_dest_aligned_random_32768(int i) {
 364 |     memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 365 |         buffer_page + 65536 + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 366 |         32768);
 367 | }
 368 | 
 369 | static void test_source_dest_aligned_random_1M(int i) {
 370 |     memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 371 |         buffer_page + 2 * 1024 * 1024 +
 372 |         random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)],
 373 |         1024 * 1024);
 374 | }
 375 | 
 376 | static void test_word_aligned_28(int i) {
 377 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 378 |         buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 379 |         28);
 380 | }
 381 | 
 382 | static void test_word_aligned_64(int i) {
 383 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 384 |         buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 385 |         64);
 386 | }
 387 | 
 388 | static void test_word_aligned_296(int i) {
 389 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 390 |         buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 391 |         296);
 392 | }
 393 | 
 394 | static void test_word_aligned_1024(int i) {
 395 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 396 |         buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 397 |         1024);
 398 | }
 399 | 
 400 | static void test_word_aligned_4096(int i) {
 401 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 402 |         buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 403 |         4096);
 404 | }
 405 | 
 406 | static void test_word_aligned_32768(int i) {
 407 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 408 |         buffer_page + 128 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 409 |         32768);
 410 | }
 411 | 
 412 | static void test_chunk_aligned_64(int i) {
 413 |     memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 414 |         buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 415 |         64);
 416 | }
 417 | 
 418 | static void test_chunk_aligned_296(int i) {
 419 |     memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 420 |         buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 421 |         296);
 422 | }
 423 | 
 424 | static void test_chunk_aligned_1024(int i) {
 425 |     memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 426 |         buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 427 |         1024);
 428 | }
 429 | 
 430 | static void test_chunk_aligned_4096(int i) {
 431 |     memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 432 |         buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 433 |         4096);
 434 | }
 435 | 
 436 | static void test_chunk_aligned_32768(int i) {
 437 |     memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 438 |         buffer_chunk + 128 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32,
 439 |         32768);
 440 | }
 441 | 
 442 | static void test_page_aligned_1024(int i) {
 443 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 444 |         buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 445 |         1024);
 446 | }
 447 | 
 448 | static void test_page_aligned_4096(int i) {
 449 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 450 |         buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 451 |         4096);
 452 | }
 453 | 
 454 | static void test_page_aligned_32768(int i) {
 455 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 456 |         buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 457 |         32768);
 458 | }
 459 | 
 460 | static void test_page_aligned_256K(int i) {
 461 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 462 |         buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 463 |         256 * 1024);
 464 | }
 465 | 
 466 | static void test_page_aligned_1M(int i) {
 467 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 468 |         buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 469 |         1024 * 1024);
 470 | }
 471 | 
 472 | static void test_page_aligned_8M(int i) {
 473 |     memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 474 |         buffer_page + 16384 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 475 |         8 * 1024 * 1024);
 476 | }
 477 | 
 478 | static void test_random_mixed_sizes_1024(int i) {
 479 |     memcpy_func(buffer_page + random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)],
 480 |         buffer_page + 4096 + random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 481 |         1 + random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))]);
 482 | }
 483 | 
 484 | static void test_random_mixed_sizes_64(int i) {
 485 |     memcpy_func(buffer_page + random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)],
 486 |         buffer_page + 4096 + random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 487 |         1 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 63));
 488 | }
 489 | 
 490 | static void test_random_mixed_sizes_DRAM_1024(int i) {
 491 |     /* Source and destination address selected randomly from range of 8MB. */
 492 |     memcpy_func(buffer_page +
 493 |         // Select a random 8192 bytes aligned addres.
 494 |         8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] +
 495 |         // Add a random offset up to (4096 - 256) in steps of 256 based on higher bits
 496 |         // of the iteration number.
 497 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 498 |         // Add a random offset up to 1023 in steps of 1 based on the lower end bits
 499 |         // of the iteration number.
 500 |         random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)],
 501 |         buffer_page +
 502 |         8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] +
 503 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 504 |         random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 505 |         1 + random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))]);
 506 | }
 507 | 
 508 | static void test_random_mixed_sizes_DRAM_64(int i) {
 509 |     /* Source and destination address selected randomly from range of 8MB. */
 510 |     memcpy_func(buffer_page +
 511 |         8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] +
 512 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 513 |         random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)],
 514 |         buffer_page +
 515 |         8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] +
 516 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 517 |         random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)],
 518 |         1 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 63));
 519 | }
 520 | 
 521 | static void test_random_mixed_sizes_DRAM_word_aligned_1024(int i) {
 522 |     /* Source and destination address selected randomly from range of 8MB. */
 523 |     memcpy_func(buffer_page +
 524 |         // Select a random 8192 bytes aligned addres.
 525 |         8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] +
 526 |         // Add a random offset up to (4096 - 256) in steps of 256 based on higher bits
 527 |         // of the iteration number.
 528 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 529 |         // Add a random offset up to 1020 in steps of 4 based on the lower end bits
 530 |         // of the iteration number.
 531 |         (random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)] & (~3)),
 532 |         buffer_page +
 533 |         8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] +
 534 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 535 |         (random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)] & (~3)),
 536 |         4 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & (~3)));
 537 | }
 538 | 
 539 | static void test_random_mixed_sizes_DRAM_word_aligned_256(int i) {
 540 |     /* Source and destination address selected randomly from range of 8MB. */
 541 |     memcpy_func(buffer_page +
 542 |         8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] +
 543 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 544 |         (random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)] & (~3)),
 545 |         buffer_page +
 546 |         8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] +
 547 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 548 |         (random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)] & (~3)),
 549 |         4 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 252));
 550 | }
 551 | 
 552 | static void test_random_mixed_sizes_DRAM_word_aligned_64(int i) {
 553 |     /* Source and destination address selected randomly from range of 8MB. */
 554 |     memcpy_func(buffer_page +
 555 |         8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] +
 556 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 557 |         (random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)] & (~3)),
 558 |         buffer_page +
 559 |         8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] +
 560 |         ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 +
 561 |         (random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)] & (~3)),
 562 |         4 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 60));
 563 | }
 564 | 
 565 | static void test_memset_page_aligned_1024(int i) {
 566 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 567 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 1024);
 568 | }
 569 | 
 570 | static void test_memset_page_aligned_4096(int i) {
 571 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096,
 572 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 4096);
 573 | }
 574 | 
 575 | static void test_memset_mixed_powers_of_two_word_aligned(int i) {
 576 |     memset_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 577 |         random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 578 |         random_buffer_powers_of_two_up_to_4096_power_law[i & (RANDOM_BUFFER_SIZE - 1)]);
 579 | }
 580 | 
 581 | static void test_memset_mixed_power_law_word_aligned(int i) {
 582 |     memset_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 583 |         random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 584 |         random_buffer_multiples_of_four_up_to_1024_power_law[i & (RANDOM_BUFFER_SIZE - 1)]);
 585 | }
 586 | 
 587 | static void test_memset_mixed_power_law_unaligned(int i) {
 588 |     memset_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 589 |         random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 590 |         random_buffer_up_to_1023_power_law[i & (RANDOM_BUFFER_SIZE - 1)]);
 591 | }
 592 | 
 593 | static void test_memset_aligned_4(int i) {
 594 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 595 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 596 |         4);
 597 | }
 598 | 
 599 | static void test_memset_aligned_8(int i) {
 600 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 601 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 602 |         8);
 603 | }
 604 | 
 605 | static void test_memset_aligned_16(int i) {
 606 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 607 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 608 |         16);
 609 | }
 610 | 
 611 | static void test_memset_aligned_28(int i) {
 612 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 613 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 614 |         28);
 615 | }
 616 | 
 617 | static void test_memset_aligned_32(int i) {
 618 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 619 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 620 |         32);
 621 | }
 622 | 
 623 | static void test_memset_aligned_64(int i) {
 624 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 625 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 626 |         64);
 627 | }
 628 | 
 629 | static void test_memset_various_aligned_64(int i) {
 630 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32 + test_alignment,
 631 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 632 |         64);
 633 | }
 634 | 
 635 | static void test_memset_aligned_80(int i) {
 636 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 637 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 638 |         80);
 639 | }
 640 | 
 641 | static void test_memset_aligned_92(int i) {
 642 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 643 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 644 |         92);
 645 | }
 646 | 
 647 | static void test_memset_aligned_128(int i) {
 648 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 649 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 650 |         128);
 651 | }
 652 | 
 653 | static void test_memset_aligned_256(int i) {
 654 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4,
 655 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 656 |         256);
 657 | }
 658 | 
 659 | static void test_memset_unaligned_random_3(int i) {
 660 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 661 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 662 |         3);
 663 | }
 664 | 
 665 | static void test_memset_unaligned_random_8(int i) {
 666 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 667 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 668 |         8);
 669 | }
 670 | 
 671 | static void test_memset_unaligned_random_17(int i) {
 672 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 673 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 674 |         17);
 675 | }
 676 | 
 677 | static void test_memset_unaligned_random_28(int i) {
 678 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 679 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 680 |         28);
 681 | }
 682 | 
 683 | static void test_memset_unaligned_random_64(int i) {
 684 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 685 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 686 |         64);
 687 | }
 688 | 
 689 | static void test_memset_unaligned_random_137(int i) {
 690 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 691 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 692 |         137);
 693 | }
 694 | 
 695 | static void test_memset_unaligned_random_1023(int i) {
 696 |     memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)],
 697 |         random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF,
 698 |         1023);
 699 | }
 700 | 
 701 | static void clear_data_cache() {
 702 |     int val = 0;
 703 |     for (int i = 0; i < 1024 * 1024 * 32; i += 4) {
 704 |         val += buffer_alloc[i];
 705 |     }
 706 |     for (int i = 0; i < 1024 * 1024 * 32; i += 4) {
 707 |         buffer_alloc[i] = val;
 708 |     }
 709 | }
 710 | 
 711 | static void do_test(const char *name, void (*test_func)(int), int bytes) {
 712 |     int nu_iterations;
 713 |     if (bytes >= 1024) 
 714 |         nu_iterations = (64 * 1024 * 1024) / bytes;
 715 |     else if (bytes >= 64)
 716 |         nu_iterations = (16 * 1024 * 1024) / bytes;
 717 |     else
 718 |         nu_iterations = 1024 * 1024 / 2;
 719 |     /* Warm-up. */
 720 |     clear_data_cache();
 721 |     double temp_time = get_time();
 722 |     for (int i = 0; i < nu_iterations; i++)
 723 |        test_func(i);
 724 |     usleep(100000);
 725 |     double start_time = get_time();
 726 |     double end_time;
 727 |     int count = 0;
 728 |     for (;;) {
 729 |         for (int i = 0; i < nu_iterations; i++)
 730 |             test_func(i);
 731 |         count++;
 732 |         end_time = get_time();
 733 |         if (end_time - start_time >= test_duration)
 734 |             break;
 735 |     }
 736 |     double bandwidth = (double)bytes * nu_iterations * count / (1024 * 1024)
 737 |         / (end_time - start_time);
 738 |     printf("%s: %.2lf MB/s\n", name, bandwidth);
 739 | }
 740 | 
 741 | static void do_test_all(const char *name, void (*test_func)(), int bytes) {
 742 |     for (int j = 0; j < NU_MEMCPY_VARIANTS; j++)
 743 |         if (memcpy_mask[j]) {
 744 |             printf("%s:\n", memcpy_variant_name[j]);
 745 |             memcpy_func = memcpy_variant[j];
 746 |             do_test(name, test_func, bytes);
 747 |         }
 748 | }
 749 | 
 750 | static void fill_buffer(uint8_t *buffer) {
 751 |     uint32_t v = 0xEEAAEEAA;
 752 |     for (int i = 0; i < 1024 * 1024 * 16; i++) {
 753 |         buffer[i] = (v >> 24);
 754 |         v += i ^ 0x12345678;
 755 |     }
 756 | }
 757 | 
 758 | static int compare_buffers(uint8_t *buffer0, uint8_t *buffer1) {
 759 |     int identical = 1;
 760 |     int count = 0;
 761 |     for (int i = 0; i < 1024 * 1024 * 16; i++) {
 762 |         if (buffer0[i] != buffer1[i]) {
 763 |             count++;
 764 | 	    if (count < 10) {
 765 |                 printf("Byte at offset %d (0x%08X) doesn't match.\n",
 766 |                     i, i);
 767 |                 identical = 0;
 768 |             }
 769 | 	}
 770 |     }
 771 |     if (count >= 10) {
 772 |         printf("(%d more non-matching bytes present.)\n", count - 9);
 773 |     }
 774 |     return identical;
 775 | }
 776 | 
 777 | static void memcpy_emulate(uint8_t *dest, uint8_t *src, int size) {
 778 |     for (int i = 0; i < size; i++)
 779 |         dest[i] = src[i];
 780 | }
 781 | 
 782 | static void do_validation(int repeat) {
 783 |     int passed = 1;
 784 |     for (int i = 0; i < 10 * repeat; i++)  {
 785 |         int size, source, dest;
 786 |             size = floor(pow(2.0, (double)rand() * 20.0 / RAND_MAX));
 787 |             source = rand() % (1024 * 1024 * 16 + 1 - size);
 788 |             int aligned = 0;
 789 |             if ((rand() & 3) == 0) {
 790 | 		aligned = 1;
 791 |                 source &= ~3;
 792 |                 size = (size + 3) & (~3);
 793 |             }
 794 |             do {
 795 |                 dest = rand() % (1024 * 1024 * 16 + 1 - size);
 796 |                 if (aligned)
 797 |                     dest &= ~3;
 798 |             }
 799 |             while (dest + size > source && dest < source + size);
 800 |         printf("Testing (source offset = 0x%08X, destination offset = 0x%08X, size = %d).\n",
 801 |                 source, dest, size);
 802 |         fflush(stdout);
 803 |         fill_buffer(buffer_compare);
 804 |         memcpy_emulate(buffer_compare + dest, buffer_compare + source, size);
 805 |         fill_buffer(buffer_page);
 806 |         if (memcpy_func(buffer_page + dest, buffer_page + source, size) != buffer_page + dest) {
 807 |             printf("Validation failed: function did not return original destination address.\n");
 808 |             passed = 0;
 809 |         }
 810 |         if (!compare_buffers(buffer_page, buffer_compare)) {
 811 |             printf("Validation failed (source offset = 0x%08X, destination offset = 0x%08X, size = %d).\n",
 812 |                 source, dest, size);
 813 |             passed = 0;
 814 |         }
 815 |     }
 816 |     if (passed) {
 817 |         printf("Passed.\n");
 818 |     }
 819 | }
 820 | 
 821 | static void memset_emulate(uint8_t *dest, int c, int size) {
 822 |     for (int i = 0; i < size; i++)
 823 |         dest[i] = c;
 824 | }
 825 | 
 826 | static void do_validation_memset(int repeat) {
 827 |     int passed = 1;
 828 |     for (int i = 0; i < 10 * repeat; i++)  {
 829 |         int size, dest, c;
 830 |         size = floor(pow(2.0, (double)rand() * 20.0 / RAND_MAX));
 831 |         dest = rand() % (1024 * 1024 * 16 + 1 - size);
 832 |         c = rand() & 0xFF;
 833 |         printf("Testing (destination offset = 0x%08X, byte = %d, size = %d).\n",
 834 |                 dest, c, size);
 835 |         fflush(stdout);
 836 |         fill_buffer(buffer_compare);
 837 |         memset_emulate(buffer_compare + dest, c, size);
 838 |         fill_buffer(buffer_page);
 839 |         if (memset_func(buffer_page + dest, c, size) != buffer_page + dest) {
 840 |                 printf("Validation failed: function did not return original destination address.\n");
 841 |                 passed = 0;
 842 |         }
 843 |         if (!compare_buffers(buffer_page, buffer_compare)) {
 844 |             printf("Validation failed (destination offset = 0x%08X, size = %d).\n",
 845 |                 dest, size);
 846 |             passed = 0;
 847 |         }
 848 |     }
 849 |     if (passed) {
 850 |         printf("Passed.\n");
 851 |     }
 852 | }
 853 | 
 854 | #define NU_TESTS 48
 855 | 
 856 | typedef struct {
 857 |     const char *name;
 858 |     void (*test_func)();
 859 |     int bytes;
 860 | } test_t;
 861 | 
 862 | static test_t test[NU_TESTS] = {
 863 |     { "Mixed powers of 2 from 4 to 4096 (power law), word aligned", test_mixed_powers_of_two_word_aligned, 32768 },
 864 |     { "Mixed multiples of 4 from 4 to 1024 (power law), word aligned", test_mixed_power_law_word_aligned, 32768 },
 865 |     { "Mixed from 1 to 1023 (power law), unaligned", test_mixed_power_law_unaligned, 32768 },
 866 |     { "4 bytes word aligned", test_aligned_4, 4 },
 867 |     { "8 bytes word aligned", test_aligned_8, 8 },
 868 |     { "16 bytes word aligned", test_aligned_16, 16 },
 869 |     { "28 bytes word aligned", test_aligned_28, 28 },
 870 |     { "32 bytes word aligned", test_aligned_32, 32 },
 871 |     { "64 bytes word aligned", test_aligned_64, 64 },
 872 |     { "128 bytes word aligned", test_aligned_128, 128 },
 873 |     { "256 bytes word aligned", test_aligned_256, 256 },
 874 |     { "3 bytes randomly aligned", test_unaligned_random_3, 3 },
 875 |     { "8 bytes randomly aligned", test_unaligned_random_8, 8 },
 876 |     { "17 bytes randomly aligned", test_unaligned_random_17, 17 },
 877 |     { "28 bytes randomly aligned", test_unaligned_random_28, 28 },
 878 |     { "64 bytes randomly aligned", test_unaligned_random_64, 64 },
 879 |     { "137 bytes randomly aligned", test_unaligned_random_137, 137 },
 880 |     { "1024 bytes randomly aligned", test_unaligned_random_1024, 1024 },
 881 |     { "32768 bytes randomly aligned", test_unaligned_random_32768, 32768 },
 882 |     { "1M bytes randomly aligned", test_unaligned_random_1M, 1024 * 1024 },
 883 |     { "64 bytes randomly aligned, source aligned with dest",
 884 |         test_source_dest_aligned_random_64, 64 },
 885 |     { "1024 bytes randomly aligned, source aligned with dest",
 886 |         test_source_dest_aligned_random_1024, 1024 },
 887 |     { "32768 bytes randomly aligned, source aligned with dest",
 888 |         test_source_dest_aligned_random_32768, 32768 },
 889 |     { "1M bytes randomly aligned, source aligned with dest",
 890 |         test_source_dest_aligned_random_1M, 1024 *1024 },
 891 |     { "Up to 1024 bytes randomly aligned", test_random_mixed_sizes_1024, 512 },
 892 |     { "Up to 64 bytes randomly aligned", test_random_mixed_sizes_64, 32 },
 893 |     { "Up to 1024 bytes randomly aligned (DRAM)", test_random_mixed_sizes_DRAM_1024,
 894 |        512 },
 895 |     { "Up to 64 bytes randomly aligned (DRAM)", test_random_mixed_sizes_DRAM_64,
 896 |        32 },
 897 |     { "Up to 1024 bytes word aligned (DRAM)", test_random_mixed_sizes_DRAM_word_aligned_1024,
 898 |        514 },
 899 |     { "Up to 256 bytes word aligned (DRAM)", test_random_mixed_sizes_DRAM_word_aligned_256,
 900 |        130 },
 901 |     { "Up to 64 bytes word aligned (DRAM)", test_random_mixed_sizes_DRAM_word_aligned_64,
 902 |        34 },
 903 |     { "28 bytes 4-byte aligned", test_word_aligned_28, 28 },
 904 |     { "64 bytes 4-byte aligned", test_word_aligned_64, 64 },
 905 |     { "296 bytes 4-byte aligned", test_word_aligned_296, 296 },
 906 |     { "1024 bytes 4-byte aligned", test_word_aligned_1024, 1024 },
 907 |     { "4096 bytes 4-byte aligned", test_word_aligned_4096, 4096 },
 908 |     { "32768 bytes 4-byte aligned", test_word_aligned_32768, 32768 },
 909 |     { "64 bytes 32-byte aligned", test_chunk_aligned_64, 64 },
 910 |     { "296 bytes 32-byte aligned", test_chunk_aligned_296, 296 },
 911 |     { "1024 bytes 32-byte aligned", test_chunk_aligned_1024, 1024 },
 912 |     { "4096 bytes 32-byte aligned", test_chunk_aligned_4096, 4096 },
 913 |     { "32768 bytes 32-byte aligned", test_chunk_aligned_32768, 32768 },
 914 |     { "1024 bytes page aligned", test_page_aligned_1024, 1024 },
 915 |     { "4096 bytes page aligned", test_page_aligned_4096, 4096 },
 916 |     { "32768 bytes page aligned", test_page_aligned_32768, 32768 },
 917 |     { "256K bytes page aligned", test_page_aligned_256K, 256 * 1024 },
 918 |     { "1M bytes page aligned", test_page_aligned_1M, 1024 * 1024 },
 919 |     { "8M bytes page aligned", test_page_aligned_8M, 8 * 1024 * 1024 },
 920 | };
 921 | 
 922 | #define NU_MEMSET_TESTS 23
 923 | 
 924 | static test_t memset_test[NU_MEMSET_TESTS] = {
 925 |     { "Mixed powers of 2 from 4 to 4096 (power law), word aligned", test_memset_mixed_powers_of_two_word_aligned, 2048 },
 926 |     { "Mixed multiples of 4 from 4 to 1024 (power law), word aligned", test_memset_mixed_power_law_word_aligned, 512 },
 927 |     { "Mixed from 1 to 1023 (power law), unaligned", test_memset_mixed_power_law_unaligned, 512 },
 928 |     { "1024 bytes page aligned", test_memset_page_aligned_1024, 1024 },
 929 |     { "4096 bytes page aligned", test_memset_page_aligned_4096, 4096 },
 930 |     { "4 bytes word aligned", test_memset_aligned_4, 4 },
 931 |     { "8 bytes word aligned", test_memset_aligned_8, 8 },
 932 |     { "16 bytes word aligned", test_memset_aligned_16, 16 },
 933 |     { "28 bytes word aligned", test_memset_aligned_28, 28 },
 934 |     { "32 bytes word aligned", test_memset_aligned_32, 32 },
 935 |     { "64 bytes word aligned", test_memset_aligned_64, 64 },
 936 |     { "64 bytes various alignments word aligned (multi-test)", test_memset_various_aligned_64, 64 },
 937 |     { "80 bytes word aligned", test_memset_aligned_80, 80 },
 938 |     { "92 bytes word aligned", test_memset_aligned_92, 92 },
 939 |     { "128 bytes word aligned", test_memset_aligned_128, 128 },
 940 |     { "256 bytes word aligned", test_memset_aligned_256, 256 },
 941 |     { "3 bytes randomly aligned", test_memset_unaligned_random_3, 3 },
 942 |     { "8 bytes randomly aligned", test_memset_unaligned_random_8, 8 },
 943 |     { "17 bytes randomly aligned", test_memset_unaligned_random_17, 17 },
 944 |     { "28 bytes randomly aligned", test_memset_unaligned_random_28, 28 },
 945 |     { "64 bytes randomly aligned", test_memset_unaligned_random_64, 64 },
 946 |     { "137 bytes randomly aligned", test_memset_unaligned_random_137, 137 },
 947 |     { "1023 bytes randomly aligned", test_memset_unaligned_random_1023, 1023 },
 948 | };
 949 | 
 950 | static void usage() {
 951 |             printf("Commands:\n"
 952 |                 "--list          List test numbers and memcpy variants.\n"
 953 |                 "--test <number> Perform test <number> only, 5 times for each memcpy variant.\n"
 954 |                 "--all           Perform each test 5 times for each memcpy variant.\n"
 955 |                 "--help          Show this message.\n"
 956 |                 "Options:\n"
 957 |                 "--duration <n>  Sets the duration of each individual test. Default is 2 seconds.\n"
 958 |                 "--repeat <n>    Repeat each test n times. Default is 5.\n"
 959 |                 "--quick         Shorthand for --duration 1 -repeat 2.\n"
 960 |                 "--memcpy <list> Instead of testing all memcpy variants, test only the memcpy variants\n"
 961 |                 "                in <list>. <list> is a string of characters from a to h or higher, corresponding\n"
 962 |                 "                to each memcpy variant (for example, abcdef selects the first six variants).\n"
 963 |                 "--validate      Validate for correctness instead of measuring performance. The --repeat option\n"
 964 |                 "                can be used to influence the number of validation tests performed (default 5).\n"
 965 |                 );
 966 | }
 967 | 
 968 | static int char_to_memcpy_variant(char c) {
 969 |     if (c >= 'a' && c <= 'z')
 970 |         return c - 'a';
 971 |     if (c >= 'A' && c <= 'Z')
 972 |         return c - 'A' + 26;
 973 |     return - 1;
 974 | }
 975 | 
 976 | static char memcpy_variant_to_char(int i) {
 977 |     if (i < 26)
 978 |         return 'a' + i;
 979 |     return 'A' + i - 26;
 980 | }
 981 | 
 982 | int main(int argc, char *argv[]) {
 983 |     if (argc == 1) {
 984 |         usage();
 985 |         return 0;
 986 |     }
 987 |     int argi = 1;
 988 |     int command_test = - 1;
 989 |     int command_all = 0;
 990 |     int repeat = 5;
 991 |     int validate = 0;
 992 |     int memcpy_specified = 0;
 993 |     int memset_specified = 0;
 994 |     for (int i = 0; i < NU_MEMCPY_VARIANTS; i++)
 995 |         memcpy_mask[i] = 0;
 996 |     for (int i = 0; i < NU_MEMSET_VARIANTS; i++)
 997 |         memset_mask[i] = 0;
 998 |     for (;;) {
 999 |         if (argi >= argc)
1000 |             break;
1001 |         if (argi + 1 < argc && strcasecmp(argv[argi], "--test") == 0) {
1002 |             int t = atoi(argv[argi + 1]);
1003 |             if (t < 0 || t >= NU_TESTS) {
1004 |                 printf("Test out of range.\n");
1005 |                 return 1;
1006 |             }
1007 |             command_test = t;
1008 |             argi += 2;
1009 |             continue;
1010 |         }
1011 |         if (strcasecmp(argv[argi], "--quick") == 0) {
1012 |             test_duration = 1.0;
1013 |             repeat = 2;
1014 |             argi++;
1015 |             continue;
1016 |         }
1017 |         if (strcasecmp(argv[argi], "--all") == 0) {
1018 |             command_all = 1;
1019 |             argi++;
1020 |             continue;
1021 |         }
1022 |         if (strcasecmp(argv[argi], "--list") == 0) {
1023 |             printf("Tests (memcpy):\n");
1024 |             for (int i = 0; i < NU_TESTS; i++)
1025 |                 printf("%3d    %s\n", i, test[i].name);
1026 |             printf("Tests (memset):\n");
1027 |             for (int i = 0; i < NU_MEMSET_TESTS; i++)
1028 |                 printf("%3d    %s\n", i, memset_test[i].name);
1029 |             printf("memcpy variants:\n");
1030 |             for (int i = 0; i < NU_MEMCPY_VARIANTS; i++)
1031 |                 printf("  %c    %s\n", memcpy_variant_to_char(i), memcpy_variant_name[i]);
1032 |             printf("memset variants:\n");
1033 |             for (int i = 0; i < NU_MEMSET_VARIANTS; i++)
1034 |                 printf("  %c    %s\n", memcpy_variant_to_char(i), memset_variant_name[i]);
1035 |             return 0;
1036 |         }
1037 |         if (strcasecmp(argv[argi], "--help") == 0) {
1038 |             usage();
1039 |             return 0;
1040 |         }
1041 |         if (argi + 1 < argc && strcasecmp(argv[argi], "--duration") == 0) {
1042 |             double d = strtod(argv[argi + 1], NULL);
1043 |             if (d < 0.1 || d >= 100.0) {
1044 |                 printf("Duration out of range.\n");
1045 |                 return 1;
1046 |             }
1047 |             test_duration = d;
1048 |             argi += 2;
1049 |             continue;
1050 |         }
1051 |         if (argi + 1 < argc && strcasecmp(argv[argi], "--memcpy") == 0) {
1052 |             for (int i = 0; i < NU_MEMCPY_VARIANTS; i++)
1053 |                 memcpy_mask[i] = 0;
1054 |             for (int i = 0; i < strlen(argv[argi + 1]); i++)
1055 |                 if (char_to_memcpy_variant(argv[argi + 1][i]) >= 0 && char_to_memcpy_variant(argv[argi + 1][i]) < NU_MEMCPY_VARIANTS)
1056 |                     memcpy_mask[char_to_memcpy_variant(argv[argi + 1][i])] = 1;
1057 |             memcpy_specified = 1;
1058 |             argi += 2;
1059 |             continue;
1060 |         }
1061 |         if (argi + 1 < argc && strcasecmp(argv[argi], "--repeat") == 0) {
1062 |             repeat = atoi(argv[argi + 1]);
1063 |             if (repeat < 1 || repeat >= 1000) {
1064 |                 printf("Number of repeats out of range.\n");
1065 |                 return 1;
1066 |             }
1067 |             argi += 2;
1068 |             continue;
1069 |         }
1070 |         if (strcasecmp(argv[argi], "--validate") == 0) {
1071 |             validate = 1;
1072 |             argi++;
1073 |             continue;
1074 |         }
1075 |         if (argi + 1 < argc && strcasecmp(argv[argi], "--memset") == 0) {
1076 |             for (int i = 0; i < NU_MEMSET_VARIANTS; i++)
1077 |                 memset_mask[i] = 0;
1078 |             for (int i = 0; i < strlen(argv[argi + 1]); i++)
1079 |                 if (char_to_memcpy_variant(argv[argi + 1][i]) >= 0 && char_to_memcpy_variant(argv[argi + 1][i]) < NU_MEMSET_VARIANTS)
1080 |                     memset_mask[char_to_memcpy_variant(argv[argi + 1][i])] = 1;
1081 |             memset_specified = 1;
1082 |             argi += 2;
1083 |             continue;
1084 |         }
1085 |         printf("Unkown option. Try --help.\n");
1086 |         return 1;
1087 |     }
1088 | 
1089 |     if (memcpy_specified && memset_specified) {
1090 |         printf("Specify only one of --memcpy and --memset.\n");
1091 |         return 1;
1092 |     }
1093 | 
1094 |     if (command_test != -1 && memset_specified &&
1095 |     command_test >= NU_MEMSET_TESTS) {
1096 |         printf("Test out of range for memset.\n");
1097 |         return 1;
1098 |     }
1099 | 
1100 |     if ((command_test != -1) + command_all != 1 && !validate) {
1101 |         printf("Specify only one of --test and --all.\n");
1102 |         return 1;
1103 |     }
1104 | 
1105 |     buffer_alloc = malloc(1024 * 1024 * 32);
1106 |     buffer_page = (uint8_t *)buffer_alloc + ((4096 - ((uintptr_t)buffer_alloc & 4095))
1107 |         & 4095);
1108 |     buffer_chunk = buffer_page + 17 * 32;
1109 |     if (validate)
1110 |         buffer_compare = malloc(1024 * 1024 * 16);
1111 |     srand(0);
1112 |     random_buffer_1024 = malloc(sizeof(int) * RANDOM_BUFFER_SIZE);
1113 |     for (int i = 0; i < RANDOM_BUFFER_SIZE; i++)
1114 |         random_buffer_1024[i] = rand() % 1024;
1115 |     random_buffer_1M = malloc(sizeof(int) * RANDOM_BUFFER_SIZE);
1116 |     for (int i = 0; i < RANDOM_BUFFER_SIZE; i++)
1117 |         random_buffer_1M[i] = rand() % (1024 * 1024);
1118 |     random_buffer_powers_of_two_up_to_4096_power_law = malloc(sizeof(int) * RANDOM_BUFFER_SIZE);
1119 |     int random_buffer_powers_of_two_up_to_4096_power_law_total_bytes = 0;
1120 |     for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) {
1121 |         int size = 4 << (int)floor(11.0 * pow(1.5, 10.0 * (double)rand() / RAND_MAX) / pow(1.5, 10.0));
1122 |         random_buffer_powers_of_two_up_to_4096_power_law[i] = size;
1123 |         random_buffer_powers_of_two_up_to_4096_power_law_total_bytes += size;
1124 |     }
1125 |     test[0].bytes = random_buffer_powers_of_two_up_to_4096_power_law_total_bytes / RANDOM_BUFFER_SIZE;
1126 |     memset_test[0].bytes = test[0].bytes;
1127 |     random_buffer_multiples_of_four_up_to_1024_power_law = malloc(sizeof(int) * RANDOM_BUFFER_SIZE);
1128 |     int random_buffer_multiples_of_four_up_to_1024_power_law_total_bytes = 0;
1129 |     for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) {
1130 |         double f = (double)rand() / RAND_MAX;
1131 |         int size;
1132 |         if (f < 0.9)
1133 |             /* 90% in the range 4 to 256. */
1134 |             size = 4 + ((int)floor(252.0 *
1135 |                 (pow(1.0 + f / 0.9, 5.0) - 1.0) / (pow(2.0, 5.0) - 1.0)
1136 |                 ) & (~3));
1137 |         else
1138 |             /* 10% in the range 260 to 1024 */
1139 |             size = 4 + ((int)floor((1024 - 260.0) *
1140 |                 (pow(1.0 + (f - 0.9) / 0.1, 8.0) - 1.0) / (pow(2.0, 8.0) - 1.0)
1141 |                 ) & (~3));
1142 |         random_buffer_multiples_of_four_up_to_1024_power_law[i] = size;
1143 |         random_buffer_multiples_of_four_up_to_1024_power_law_total_bytes += size;
1144 |     }
1145 |     test[1].bytes = random_buffer_multiples_of_four_up_to_1024_power_law_total_bytes / RANDOM_BUFFER_SIZE;
1146 |     memset_test[1].bytes = test[1].bytes;
1147 |     random_buffer_up_to_1023_power_law = malloc(sizeof(int) * RANDOM_BUFFER_SIZE);
1148 |     int random_buffer_up_to_1023_power_law_total_bytes = 0;
1149 |     for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) {
1150 |         int size;
1151 |         size = 1 + (int)floor(1024.0 * (pow(2.0, 10.0 * (double)rand() / RAND_MAX) - 1.0) / (pow(2.0, 10.0) - 1.0));
1152 |         random_buffer_up_to_1023_power_law[i] = size;
1153 |         random_buffer_up_to_1023_power_law_total_bytes += size;
1154 |     }
1155 |     test[2].bytes = random_buffer_up_to_1023_power_law_total_bytes / RANDOM_BUFFER_SIZE;
1156 |     memset_test[2].bytes = test[2].bytes;
1157 | 
1158 |     if (sizeof(size_t) != sizeof(int)) {
1159 |         printf("sizeof(size_t) != sizeof(int), unable to directly replace memcpy.\n");
1160 |         return 1;
1161 |     }
1162 | 
1163 |     int start_test, end_test;
1164 |     start_test = 0;
1165 |     if (memset_specified)
1166 |         end_test = NU_MEMSET_TESTS - 1;
1167 |     else
1168 |         end_test = NU_TESTS - 1;
1169 |     if (command_test != - 1) {
1170 |         start_test = command_test;
1171 |         end_test = command_test;
1172 |     }
1173 |     if (validate) {
1174 |         for (int j = 0; j < NU_MEMCPY_VARIANTS; j++)
1175 |             if (memcpy_mask[j]) {
1176 |                 printf("%s:\n", memcpy_variant_name[j]);
1177 |                 memcpy_func = memcpy_variant[j];
1178 |                 do_validation(repeat);
1179 |             }
1180 |         for (int j = 0; j < NU_MEMSET_VARIANTS; j++)
1181 |             if (memset_mask[j]) {
1182 |                 printf("%s:\n", memset_variant_name[j]);
1183 |                 memset_func = memset_variant[j];
1184 |                 do_validation_memset(repeat);
1185 |             }
1186 |         return 0;
1187 |     }
1188 |     if (!memcpy_specified)
1189 |         goto skip_memcpy_test;
1190 |     for (int t = start_test; t <= end_test; t++) {
1191 |         for (int j = 0; j < NU_MEMCPY_VARIANTS; j++)
1192 |             if (memcpy_mask[j]) {
1193 |                 printf("%s:\n", memcpy_variant_name[j]);
1194 |                 memcpy_func = memcpy_variant[j];
1195 |                 for (int i = 0; i < repeat; i++)
1196 |                     do_test(test[t].name, test[t].test_func, test[t].bytes);
1197 |             }
1198 |     }
1199 | skip_memcpy_test:
1200 |     if (!memset_specified)
1201 |         goto skip_memset_test;
1202 |     for (int t = start_test; t <= end_test; t++) {
1203 |         if (t == 11) {
1204 |             for (test_alignment = 0; test_alignment < 32; test_alignment += 4) {
1205 |                 char test_name[128];
1206 |                 sprintf(test_name, "%s (alignment %d)", memset_test[t].name,
1207 |                     test_alignment);
1208 |                 for (int j = 0; j < NU_MEMSET_VARIANTS; j++)
1209 |                     if (memset_mask[j]) {
1210 |                         printf("%s:\n", memset_variant_name[j]);
1211 |                         memset_func = memset_variant[j];
1212 |                         for (int i = 0; i < repeat; i++)
1213 |                             do_test(test_name, memset_test[t].test_func, memset_test[t].bytes);
1214 |                     }
1215 |             }
1216 |             continue;
1217 |         }
1218 |         for (int j = 0; j < NU_MEMSET_VARIANTS; j++)
1219 |             if (memset_mask[j]) {
1220 |                 printf("%s:\n", memset_variant_name[j]);
1221 |                 memset_func = memset_variant[j];
1222 |                 for (int i = 0; i < repeat; i++)
1223 |                     do_test(memset_test[t].name, memset_test[t].test_func, memset_test[t].bytes);
1224 |             }
1225 |     }
1226 | skip_memset_test:
1227 |     exit(0);
1228 | }
1229 | 


--------------------------------------------------------------------------------
/memcpy-hybrid.S:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2010-2011, Linaro Limited
  2 |    All rights reserved.
  3 | 
  4 |    Redistribution and use in source and binary forms, with or without
  5 |    modification, are permitted provided that the following conditions
  6 |    are met:
  7 | 
  8 |       * Redistributions of source code must retain the above copyright
  9 |       notice, this list of conditions and the following disclaimer.
 10 | 
 11 |       * Redistributions in binary form must reproduce the above copyright
 12 |       notice, this list of conditions and the following disclaimer in the
 13 |       documentation and/or other materials provided with the distribution.
 14 | 
 15 |       * Neither the name of Linaro Limited nor the names of its
 16 |       contributors may be used to endorse or promote products derived
 17 |       from this software without specific prior written permission.
 18 | 
 19 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 | 
 31 |    Written by Dave Gilbert <david.gilbert@linaro.org>
 32 | 
 33 |    This memcpy routine is optimised on a Cortex-A9 and should work on
 34 |    all ARMv7 processors with NEON. */
 35 | 
 36 | /* Modified:
 37 |  * Change preload offset to 192.
 38 |  */
 39 | #define PRELOAD_OFFSET 192
 40 | 
 41 | @ 2011-09-01 david.gilbert@linaro.org
 42 | @    Extracted from local git 2f11b436
 43 | 
 44 | 	.syntax unified
 45 | 	.arch armv7-a
 46 | 	.fpu neon
 47 | 
 48 | @ this lets us check a flag in a 00/ff byte easily in either endianness
 49 | #ifdef __ARMEB__
 50 | #define CHARTSTMASK(c) 1<<(31-(c*8))
 51 | #else
 52 | #define CHARTSTMASK(c) 1<<(c*8)
 53 | #endif
 54 | 	.text
 55 | 	.thumb
 56 | 
 57 | @ ---------------------------------------------------------------------------
 58 | 	.thumb_func
 59 | 	.align 2
 60 | 	.p2align 4,,15
 61 | 	.global memcpy_hybrid
 62 | 	.type memcpy_hybrid,%function
 63 | memcpy_hybrid:
 64 | 	@ r0 = dest
 65 | 	@ r1 = source
 66 | 	@ r2 = count
 67 | 	@ returns dest in r0
 68 | 	@ Overlaps of source/dest not allowed according to spec
 69 | 	@ Note this routine relies on v7 misaligned loads/stores
 70 | 	pld	[r1]
 71 | 	mov	r12, r0		@ stash original r0
 72 | 	cmp	r2,#32
 73 | 	blt	10f		@ take the small copy case separately
 74 | 
 75 | 	@ test for either source or destination being misaligned
 76 | 	@ (We only rely on word align)
 77 | 	tst	r0,#3
 78 | 	it	eq
 79 | 	tsteq	r1,#3
 80 | 	bne	30f		@ misaligned case
 81 | 
 82 | 4:
 83 | 	@ at this point we are word (or better) aligned and have at least
 84 | 	@ 32 bytes to play with
 85 | 
 86 | 	@ If it's a huge copy,  try Neon
 87 | 	cmp	r2, #128*1024
 88 | 	bge	35f		@ Sharing general non-aligned case here, aligned could be faster
 89 | 	
 90 | 	push	{r3,r4,r5,r6,r7,r8,r10,r11}
 91 | 5:
 92 | 	ldmia	r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
 93 | 	sub	r2,r2,#32
 94 | 	pld	[r1,#PRELOAD_OFFSET]
 95 | 	cmp	r2,#32
 96 | 	stmia	r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
 97 | 	bge	5b
 98 | 
 99 | 	pop	{r3,r4,r5,r6,r7,r8,r10,r11}
100 | 	@ We are now down to less than 32 bytes
101 | 	cbz	r2,15f		@ quick exit for the case where we copied a multiple of 32
102 | 
103 | 10:  @ small copies (not necessarily aligned - note might be slightly more than 32bytes)
104 | 	cmp	r2,#4
105 | 	blt	12f
106 | 11:
107 | 	sub	r2,r2,#4
108 | 	cmp	r2,#4
109 | 	ldr	r3, [r1],#4
110 | 	str	r3, [r0],#4
111 | 	bge	11b
112 | 12:
113 | 	tst	r2,#2
114 | 	itt	ne
115 | 	ldrhne	r3, [r1],#2
116 | 	strhne	r3, [r0],#2
117 | 	
118 | 	tst	r2,#1
119 | 	itt	ne
120 | 	ldrbne	r3, [r1],#1
121 | 	strbne	r3, [r0],#1
122 | 	
123 | 15:  @ exit
124 | 	mov	r0,r12		@ restore r0
125 | 	bx	lr
126 | 
127 | 	.align 2
128 | 	.p2align 4,,15
129 | 30:  @ non-aligned - at least 32 bytes to play with
130 | 	@ Test for co-misalignment
131 | 	eor	r3, r0, r1
132 | 	tst	r3,#3
133 | 	beq	50f
134 | 
135 | 	@ Use Neon for misaligned
136 | 35:
137 | 	vld1.8	{d0,d1,d2,d3}, [r1]!
138 | 	sub	r2,r2,#32
139 | 	cmp	r2,#32
140 | 	pld	[r1,#PRELOAD_OFFSET]
141 | 	vst1.8	{d0,d1,d2,d3}, [r0]!
142 | 	bge	35b
143 | 	b	10b		@ TODO: Probably a bad idea to switch to ARM at this point
144 | 
145 | 	.align 2
146 | 	.p2align 4,,15
147 | 50: @ Co-misaligned
148 | 	@ At this point we've got at least 32 bytes
149 | 51:
150 | 	ldrb	r3,[r1],#1
151 | 	sub	r2,r2,#1
152 | 	strb	r3,[r0],#1
153 | 	tst	r0,#7
154 | 	bne	51b
155 | 
156 | 	cmp	r2,#32
157 | 	blt	10b
158 | 	b	4b
159 | 


--------------------------------------------------------------------------------
/memcpy-hybrid.h:
--------------------------------------------------------------------------------
1 | 
2 | extern void *memcpy_hybrid(void *dest, const void *src, size_t n);
3 | 


--------------------------------------------------------------------------------
/new_arm.S:
--------------------------------------------------------------------------------
   1 | /*
   2 |  * Copyright 2013 Harm Hanemaaijer <fgenfb@yahoo.com>
   3 |  *
   4 |  * Permission is hereby granted, free of charge, to any person obtaining a
   5 |  * copy of this software and associated documentation files (the "Software"),
   6 |  * to deal in the Software without restriction, including without limitation
   7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 |  * and/or sell copies of the Software, and to permit persons to whom the
   9 |  * Software is furnished to do so, subject to the following conditions:
  10 |  *
  11 |  * The above copyright notice and this permission notice (including the next
  12 |  * paragraph) shall be included in all copies or substantial portions of the
  13 |  * Software.
  14 |  *
  15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21 |  * DEALINGS IN THE SOFTWARE.
  22 |  *
  23 |  */
  24 | 
  25 | #ifdef CONFIG_THUMB
  26 | #define W(instr) instr.w
  27 | #define THUMB(instr...)	instr
  28 | #define ARM(instr...)
  29 | #else
  30 | #define W(instr) instr
  31 | #define THUMB(instr...)
  32 | #define ARM(instr...) instr
  33 | #endif
  34 | 
  35 | /*
  36 |  * In practice, because the way NEON is configured on most systems,
  37 |  * specifying alignment hints for NEON instructions doesn't seem
  38 |  * to improve performance, or even degrade performance in some cases.
  39 |  * However, actually having the address aligned to an element
  40 |  * boundary or greater is beneficial.
  41 |  */
  42 | #define NEON_ALIGN(n)
  43 | /* #define NEON_ALIGN(n) :n */
  44 | 
  45 | /* Prevent the stack from becoming executable */
  46 | #if defined(__linux__) && defined(__ELF__)
  47 | .section .note.GNU-stack,"",%progbits
  48 | #endif
  49 | 
  50 | .text
  51 | .syntax unified
  52 | .arch armv7a
  53 | .fpu neon
  54 | 
  55 | .macro asm_function function_name
  56 |     .global \function_name
  57 | .func \function_name
  58 | .type \function_name, function
  59 | ARM(    .p2align 5      )
  60 | THUMB(  .p2align 2      )
  61 | \function_name:
  62 | .endm
  63 | 
  64 | /*
  65 |  * The following memcpy implementation is optimized with a fast path
  66 |  * for common, word aligned cases and optionally use unaligned access for
  67 |  * small sizes.
  68 |  *
  69 |  * - line_size is the cache line size used for prefetches. Must be 64 or 32.
  70 |  * - prefetch_distance is the number of cache lines to look ahead and must be
  71 |  *   >= 2.
  72 |  * - write_align is the write alignment enforced before the main loop for larger
  73 |  *   sizes (word aligned case) and must be 0, 16, 32, or 64.
  74 |  * - aligned_access must be 0 or 1. When enabled, no unaligned memory accesses
  75 |  *   will occur. Both small size tresholds for unaligned access are not used
  76 |  *   in this case.
  77 |  */
  78 | 
  79 | /* The threshold size for using the fast path for the word-aligned case. */
  80 | #define FAST_PATH_THRESHOLD 256
  81 | /* The threshold size for using the small size path for the word-aligned case. */
  82 | #define SMALL_SIZE_THRESHOLD 15
  83 | /*
  84 |  * The threshold size for using the small size path for the unaligned case.
  85 |  * Unaligned memory accesses will be generated for requests smaller or equal to
  86 |  * this size.
  87 |  */
  88 | #define UNALIGNED_SMALL_SIZE_THRESHOLD 64
  89 | /*
  90 |  * The threshold size for using the small size path when both the source and
  91 |  * the destination are unaligned. Unaligned memory accesses will be generated
  92 |  * for requests smaller of equal to this size.
  93 |  */
  94 | #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 32
  95 | 
  96 | /*
  97 |  * For a code-reduced version, define all four of the above constants to 0,
  98 |  * eliminating the fast path and small size special cases. With Thumb2
  99 |  * enabled, this resulted in a reduction in code size from 1150 to 824 bytes,
 100 |  * at the cost of lower performance for smaller sizes.
 101 |  */
 102 | // #define FAST_PATH_THRESHOLD 0
 103 | // #define SMALL_SIZE_THRESHOLD 0
 104 | // #define UNALIGNED_SMALL_SIZE_THRESHOLD 0
 105 | // #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 0
 106 | 
 107 | /*
 108 |  * EARLY_PREFETCHES is used in the fast path implementation.
 109 |  * The optimal value for EARLY_PREFETCHES was determined empirically.
 110 |  * It is equal to prefetch_distance + 1 for line_size 32.
 111 |  *            and prefetch_distance - 1 for line_size 64.
 112 |  */
 113 | #define EARLY_PREFETCHES (\prefetch_distance - (\line_size / 32) * 2 + 3)
 114 | 
 115 | #if FAST_PATH_THRESHOLD > 0
 116 | #define FAST_PATH(instr...) instr
 117 | #define NO_FAST_PATH(instr...)
 118 | #else
 119 | #define FAST_PATH(instr...)
 120 | #define NO_FAST_PATH(instr...) instr
 121 | #endif
 122 | 
 123 | 
 124 | /* Helper macro for the fast-path implementation. */
 125 | 
 126 | .macro copy_16_bytes bytes_to_go, line_size, prefetch_distance
 127 | #ifdef CONFIG_THUMB
 128 |                 /*
 129 |                  * When Thumb2 mode is enabled, the ldmia/stmia instructions
 130 |                  * will be 16-bit, and the preload instruction will be
 131 |                  * 32-bit, so we only need one 32-bit wide nop instruction
 132 |                  * when there's no preload, for a total size of two words.
 133 |                  */
 134 | .if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
 135 |  (\bytes_to_go % \line_size) == 0
 136 |         	pld     [r1, ip]
 137 |                 ldmia   r1!, {r3, r4, r5, r6}
 138 |                 stmia   r0!, {r3, r4, r5, r6}
 139 | .else
 140 |                 ldmia   r1!, {r3, r4, r5, r6}
 141 |         W(	nop	)
 142 |                 stmia   r0!, {r3, r4, r5, r6}
 143 | .endif
 144 | #else
 145 |                 /*
 146 |                  * When ARM mode is enabled, every instruction is one word,
 147 |                  * so make sure the entire block is four instructions.
 148 |                  */
 149 | .if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
 150 | (\bytes_to_go % \line_size) == 0
 151 |        		pld     [r1, ip]
 152 | .else
 153 |                 nop
 154 | .endif
 155 |                 ldmia   r1!, {r3, r4, r5, r6}
 156 |                 nop
 157 |                 stmia   r0!, {r3, r4, r5, r6}
 158 | #endif
 159 | .endm
 160 | 
 161 | 
 162 | /* Helper macro implementing unaligned copy. */
 163 | 
 164 | .macro unaligned_copy shift, line_size, prefetch_distance, write_align, \
 165 | aligned_access
 166 | 		/*
 167 | 		 * ip is the aligned source base address.
 168 | 		 * r3 is a word of data from the source.
 169 | 		 */
 170 | .if \write_align > 0
 171 | 		cmp	r2, #(32 + \write_align - 4)
 172 | .else
 173 | 		cmp	r2, #32
 174 | .endif
 175 | 		push 	{r5}
 176 | 		blt	55f
 177 | 		subs	r2, r2, #32
 178 | 
 179 | 		/* Handle write alignment. */
 180 | .if \write_align > 0
 181 | .if \write_align == 8
 182 | 		tst	r0, #4
 183 | 	        mov	r4, r3, lsr #\shift
 184 |                 ldrne	r3, [r1], #4
 185 | 		subne	r2, r2, #4
 186 |                 orrne   r4, r4, r3, lsl #(32 - \shift)
 187 |                 strne   r4, [r0], #4
 188 | .else
 189 | 		ands	r5, r0, #(\write_align - 1)
 190 | 		rsb	r5, r5, #\write_align
 191 | 		beq	59f
 192 | 		sub	r2, r2, r5
 193 | 
 194 | 58:             movs    r4, r3, lsr #\shift
 195 |                 ldr	r3, [r1], #4
 196 |                 subs    r5, r5, #4
 197 |                 orr     r4, r4, r3, lsl #(32 - \shift)
 198 |                 str     r4, [r0], #4
 199 |                 bgt     58b
 200 | 59:
 201 | .endif
 202 | .endif
 203 | 
 204 |                 /*
 205 |                  * Assume a preload at aligned base + line_size will
 206 |                  * be useful.
 207 |                  */
 208 | 		pld     [ip, #\line_size]
 209 | 		push	{r6-r11}
 210 | 		mov	r11, r3
 211 | 
 212 | 		mov	r4, ip
 213 |                 add     r5, r1, #(\prefetch_distance * \line_size)
 214 |                 subs    r2, r2, #(\prefetch_distance * \line_size)
 215 |                 bic     r3, r5, #31
 216 |                 add     r4, r4, #(2 * \line_size)
 217 |                 blt     54f
 218 |                 cmp     r4, r3
 219 |                 sub     ip, r3, r1
 220 |                 /*
 221 |                  * "Catch-up" the early preloads (which have been performed up
 222 |                  * to aligned source address + line_size) to the preload offset
 223 |                  * used in the main loop.
 224 |                  */
 225 |                 bge     52f
 226 | 51:             adds    r4, r4, #\line_size		/* Thumb16 */
 227 |                 cmp     r4, r3
 228 |                 pld     [r4, #(- \line_size)]
 229 |                 blt     51b
 230 | 52:
 231 | 		/*
 232 | 		 * Note that when L1_CACHE_BYTES is 64, we are
 233 | 		 * prefetching every 32 bytes. Although not optimal
 234 | 		 * there doesn't seem to be big penalty for the extra
 235 | 		 * preload instructions and it prevents greater
 236 | 		 * code size and complexity.
 237 | 		 */
 238 | 53:		pld	[r1, ip]
 239 | 54:
 240 | 		ldmia	r1!, {r4-r7}
 241 | 		mov	r3, r11, lsr #\shift
 242 | 		ldmia	r1!, {r8-r11}
 243 | 		orr	r3, r3, r4, lsl #(32 - \shift)
 244 | 		movs	r4, r4, lsr #\shift	/* Thumb16 */
 245 | 		orr	r4, r4, r5, lsl #(32 - \shift)
 246 | 		movs	r5, r5, lsr #\shift	/* Thumb16 */
 247 | 		orr	r5, r5, r6, lsl #(32 - \shift)
 248 | 		movs	r6, r6, lsr #\shift	/* Thumb16 */
 249 | 		orr	r6, r6, r7, lsl #(32 - \shift)
 250 | 		movs	r7, r7, lsr #\shift	/* Thumb16 */
 251 | 		orr	r7, r7, r8, lsl #(32 - \shift)
 252 | 		mov	r8, r8, lsr #\shift
 253 | 		orr	r8, r8, r9, lsl #(32 - \shift)
 254 | 		mov	r9, r9, lsr #\shift
 255 | 		orr	r9, r9, r10, lsl #(32 - \shift)
 256 | 		mov	r10, r10, lsr #\shift
 257 | 		orr	r10, r10, r11, lsl #(32 - \shift)
 258 | 		subs	r2, r2, #32
 259 | 		stmia	r0!, {r3-r10}
 260 | 		bge	53b
 261 | 		cmn	r2, #(\prefetch_distance * \line_size)
 262 | 		bge	54b
 263 | 		/* Correct the count. */
 264 | 		adds	r2, r2, #(\prefetch_distance * \line_size + 32)
 265 | 
 266 | 		mov	r3, r11
 267 | 		pop	{r6-r11}
 268 | 
 269 | 55:		bics	r5, r2, #3
 270 | 		beq	57f
 271 | 
 272 | 56:             movs    r4, r3, lsr #\shift
 273 |                 ldr	r3, [r1], #4
 274 |                 subs    r5, r5, #4
 275 |                 orr     r4, r4, r3, lsl #(32 - \shift)
 276 |                 str     r4, [r0], #4
 277 |                 bgt     56b
 278 | 
 279 | 57:		pop	{r5}
 280 | 		pop	{r4}
 281 | 		subs	r1, r1, #((32 - \shift) / 8)
 282 | .if \aligned_access == 1
 283 | 		b	7b
 284 | .else
 285 | 		b	3b
 286 | .endif
 287 | .endm
 288 | 
 289 | 
 290 | /* The main memcpy function macro. */
 291 | 
 292 | .macro memcpy_variant line_size, prefetch_distance, write_align, \
 293 | aligned_access
 294 | 
 295 | .if \aligned_access == 1
 296 | 		cmp	r2, #3
 297 | .else
 298 | NO_FAST_PATH(	cmp	r2, #3	)
 299 | .endif
 300 | 		orr	r3, r0, r1
 301 | .if \aligned_access == 1
 302 | 		push	{r0}
 303 | 		ble	7f
 304 | .else
 305 | NO_FAST_PATH(	push	{r0}	)
 306 | NO_FAST_PATH(	ble	3f	)
 307 | .endif
 308 | 		bic	ip, r1, #(\line_size - 1)
 309 | 		tst	r3, #3
 310 | 		pld	[ip]
 311 | .if \aligned_access == 1
 312 | FAST_PATH(	bne	30f	)
 313 | .else
 314 | FAST_PATH(	push	{r0}	)
 315 | FAST_PATH(	bne	7f	)	/* Unaligned source or destination. */
 316 | .endif
 317 | FAST_PATH(	cmp	r2, #FAST_PATH_THRESHOLD )
 318 | FAST_PATH(	bgt     10f	)
 319 | NO_FAST_PATH(	bne	30f	)
 320 | #if FAST_PATH_THRESHOLD == 0
 321 | 		/*
 322 | 		 * When the fast path is disabled, check whether there are
 323 | 		 * enough bytes for alignment, and jump to the main handling
 324 | 		 * code for larger sizes.
 325 | 		 */
 326 | .if \write_align > 0
 327 |  		cmp	r2, #(\write_align - 4)
 328 | 		bge	10f
 329 | .endif
 330 | 		push	{r4}
 331 | 		b	18f
 332 | #endif
 333 | 
 334 | 		/*
 335 | 		 * Fast path for aligned copies of size <= FAST_PATH_THRESHOLD.
 336 | 		 */
 337 | #if FAST_PATH_THRESHOLD > 0
 338 | #if SMALL_SIZE_THRESHOLD == 15
 339 |                 bics    r3, r2, #15
 340 |                 pld     [ip, #\line_size]
 341 | 		/* Jump for small sizes <= 15 bytes. */
 342 | 		beq	5f
 343 | #else
 344 | 		cmp	r2, #SMALL_SIZE_THRESHOLD
 345 |                 pld     [ip, #\line_size]
 346 | 		/* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
 347 | 		ble	5f
 348 | 		bic	r3, r2, #15
 349 | #endif
 350 | 
 351 | 9:		/*
 352 | 		 * This is the entry-point into the fast path from
 353 | 		 * an unaligned request that has been aligned.
 354 | 		 */
 355 | 		push	{r4, r5, r6}
 356 | 
 357 |                 /*
 358 |                  * Use a heuristic to determine whether the preload
 359 |                  * at aligned_base + 2 * line_size will be useful.
 360 |                  */
 361 | .if EARLY_PREFETCHES >= 3
 362 |                 cmp     r2, #(2 * \line_size - \line_size / 2)
 363 | .endif
 364 |                 add     r5, ip, #(EARLY_PREFETCHES * \line_size)
 365 | .if EARLY_PREFETCHES >= 3
 366 |                 blt     1f
 367 | .endif
 368 | .if EARLY_PREFETCHES == 3
 369 |                 pld     [ip, #(2 * \line_size)] )
 370 | .endif
 371 | .if EARLY_PREFETCHES == 4
 372 |                 cmp     r2, #(3 * \line_size - \line_size / 2)
 373 |                 pld     [ip, #(2 * \line_size)]
 374 |                 blt     1f
 375 |                 pld     [ip, #(3 * \line_size)]
 376 | .endif
 377 | .if EARLY_PREFETCHES == 5
 378 |                 cmp     r2, #(3 * \line_size - \line_size / 2)
 379 |                 pld     [ip, #(2 * \line_size)]
 380 |                 blt     1f
 381 |                 cmp     r2, #(4 * \line_size - \line_size / 2)
 382 |                 pld     [ip, #(3 * \line_size)]
 383 |                 blt     1f
 384 |                 pld     [ip, #(4 * \line_size)]
 385 | .endif
 386 | 
 387 | 1:              /*
 388 |                  * Set r5 so that the next preload will occur
 389 |                  * exactly at aligned_base + EARLY_PREFETCHES *
 390 |                  * line_size. For example, if line_size is 64
 391 |                  * and the number of bytes is 240, the next preload
 392 |                  * will occur after processing 48 bytes, which is derived
 393 |                  * from the formula r3 & (line_size - 1),
 394 |                  * where r3 is equal to number_of_bytes & (~15).
 395 |                  */
 396 |                 rsb     r4, r3, #256
 397 |         	subs    r5, r5, r1
 398 |         	and     ip, r3, #(\line_size - 1)
 399 |                 subs    r2, r2, r3		/* Thumb16 */
 400 | THUMB(		lsrs    r4, r4, #1	)	/* Thumb16 */
 401 |         	sub     ip, r5, ip
 402 |                 add     pc, pc, r4
 403 |                 nop
 404 |                 /* >= 256 bytes to go. */
 405 |                 copy_16_bytes 256, \line_size, \prefetch_distance
 406 |                 /* >= 240 bytes go. */
 407 |                 copy_16_bytes 240, \line_size, \prefetch_distance
 408 |                 /* >= 224 bytes to go. */
 409 |                 copy_16_bytes 224, \line_size, \prefetch_distance
 410 |                 /* >= 204 bytes go. */
 411 |                 copy_16_bytes 204, \line_size, \prefetch_distance
 412 |                 /* >= 192 bytes to go. */
 413 |                 copy_16_bytes 192, \line_size, \prefetch_distance
 414 |                 /* >= 176 bytes go. */
 415 |                 copy_16_bytes 176, \line_size, \prefetch_distance
 416 |                 /* >= 160 bytes to go. */
 417 |                 copy_16_bytes 160, \line_size, \prefetch_distance
 418 |                 /* >= 144 bytes go. */
 419 |                 copy_16_bytes 144, \line_size, \prefetch_distance
 420 |                 /* >= 128 bytes to go. */
 421 |                 copy_16_bytes 128, \line_size, \prefetch_distance
 422 |                 /* >= 112 bytes go. */
 423 |                 copy_16_bytes 112, \line_size, \prefetch_distance
 424 |                 /* >= 96 bytes to go. */
 425 |                 copy_16_bytes 96, \line_size, \prefetch_distance
 426 |                 /* >= 80 bytes to go. */
 427 |                 copy_16_bytes 80, \line_size, \prefetch_distance
 428 |                 /* >= 64 bytes to go. */
 429 |                 copy_16_bytes 64, \line_size, \prefetch_distance
 430 |                 /* >= 48 bytes to go. */
 431 |                 copy_16_bytes 48, \line_size, \prefetch_distance
 432 |                 /* >= 32 bytes to go. */
 433 |                 copy_16_bytes 32, \line_size, \prefetch_distance
 434 |                 /* At this point there are 16 to 31 bytes to go. */
 435 |                 tst     r2, #15
 436 |                 ldmia   r1!, {r3, r4, r5, r6}
 437 |                 cmpne   r2, #8
 438 |                 /*
 439 |                  * If r2 == 8, we need to clear the eq flag while
 440 |                  * making sure carry remains set.
 441 |                  */
 442 |                 tsteq   r2, #15
 443 |                 stmia   r0!, {r3, r4, r5, r6}
 444 |                 /*
 445 |                  * The equal flag is set if there are no bytes left.
 446 |                  * The carry flag is set is there are >= 8 bytes left.
 447 |                  */
 448 | 		pop	{r4, r5, r6}
 449 |                 beq     4f
 450 | 
 451 | 2:
 452 | 		/*
 453 | 		 * ARM mode imposes restrictions on the registers used
 454 | 		 * in double-word loads and stored so we have to use
 455 | 		 * single-word operations.
 456 | 		 */
 457 | .if \aligned_access == 0
 458 | 	ARM(	ldrcs	r3, [r1], #4	)
 459 | 	ARM(	ldrcs	ip, [r1], #4	)
 460 | 	ARM(	strcs	r3, [r0], #4	)
 461 | 	ARM(	strcs	ip, [r0], #4	)
 462 | 	THUMB(	ldrdcs  r3, ip, [r1], #8	)
 463 | 	THUMB(	strdcs  r3, ip, [r0], #8	)
 464 | .else
 465 | 		ldrcs	r3, [r1], #4
 466 | 		ldrcs	ip, [r1], #4
 467 | 		strcs	r3, [r0], #4
 468 | 		strcs	ip, [r0], #4
 469 | .endif
 470 |                 tst     r2, #4
 471 |                 ldrne   ip, [r1], #4
 472 |                 strne   ip, [r0], #4
 473 |                 tst     r2, #3
 474 |                 popeq	{r0}
 475 |                 bxeq	lr
 476 | 
 477 | 	        /*
 478 | 		 * Handle the last up to three bytes. Unaligned access
 479 | 		 * make take place if source or destination is not
 480 | 		 * half-word aligned.
 481 | 		 */
 482 | 3:		movs    r2, r2, lsl #31
 483 |                 ldrhcs  r3, [r1], #2
 484 |                 strhcs  r3, [r0], #2
 485 |                 ldrbne  r3, [r1], #1
 486 |                 strbne  r3, [r0], #1
 487 | 4:		pop	{r0}
 488 | 		bx	lr
 489 | 
 490 | 5:		/*
 491 | 		 * Sizes <= SMALL_SIZE_THRESHOLD bytes, both source and
 492 | 		 * destination aligned.
 493 | 		 */
 494 | #if SMALL_SIZE_THRESHOLD <= 15
 495 | 		cmp	r2, #8		/* cs if r2 >= 8. */
 496 | 		b	2b
 497 | #else
 498 | 101:		tst	r2, #4
 499 | 		ldrne	r3, [r1], #4
 500 | 		subne	r2, r2, #4
 501 | 		strne	r3, [r0], #4
 502 | 		cmp	r2, #8
 503 | 		blt	3b
 504 | 6:		cmp	r2, #16
 505 | 		ldr	r3, [r1], #4
 506 | 		ldr	ip, [r1], #4
 507 | 		str	r3, [r0], #4
 508 | 		sub	r2, r2, #8
 509 | 		str	ip, [r0], #4
 510 | 		bge	6b
 511 | 		cmp	r2, #0
 512 | 		popeq	{r0}
 513 | 		bxeq	lr
 514 | 		b	3b
 515 | #endif
 516 | 
 517 | #endif	/* FAST_PATH_THRESHOLD > 0 */
 518 | 
 519 | .if \aligned_access == 1
 520 | 		/*
 521 | 		 * Handle the last up to three bytes avoiding
 522 | 		 * unaligned memory access.
 523 | 		 */
 524 | 7:		movs    r2, r2, lsl #31
 525 |                 ldrbcs  r3, [r1], #1
 526 |                 ldrbcs  ip, [r1], #1
 527 |                 strbcs  r3, [r0], #1
 528 |                 strbcs  ip, [r0], #1
 529 |                 ldrbne  r3, [r1], #1
 530 |                 strbne  r3, [r0], #1
 531 | 		pop	{r0}
 532 | 		bx	lr
 533 | .endif
 534 | 
 535 | #if FAST_PATH_THRESHOLD > 0
 536 | .if \aligned_access == 0
 537 | 7:		/*
 538 | 		 * Unaligned source or destination. There are seperate small
 539 | 		 * size thresholds for when both source and destination are
 540 | 		 * unaligned and the other case.
 541 | 		 */
 542 | 		tst	r0, #3
 543 | 		mov	r3, #UNALIGNED_SMALL_SIZE_THRESHOLD
 544 | 		tstne	r1, #3
 545 | 		movne	r3, #BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD
 546 | 		cmp	r2, r3
 547 | 		bgt	30f
 548 | 
 549 | 		/* Small sizes, unaligned case. Use single word load/stores. */
 550 | #if SMALL_SIZE_THRESHOLD >= 16
 551 | 		/* Use the identical code path already defined above. */
 552 | 		b	101b
 553 | #else
 554 | 		tst	r2, #4
 555 | 		ldrne	r3, [r1], #4
 556 | 		subne	r2, r2, #4
 557 | 		strne	r3, [r0], #4
 558 | 		cmp	r2, #8
 559 | 		blt	3b
 560 | 8:		cmp	r2, #16
 561 | 		ldr	r3, [r1], #4
 562 | 		ldr	ip, [r1], #4
 563 | 		str	r3, [r0], #4
 564 | 		sub	r2, r2, #8
 565 | 		str	ip, [r0], #4
 566 | 		bge	8b
 567 | 		b	3b
 568 | #endif
 569 | .endif
 570 | #endif		/* FAST_PATH_THRESHOLD > 0 */
 571 | 
 572 | 10:		/*
 573 | 		 * This is the start of the handling of larger sizes for
 574 | 		 * aligned copies.
 575 | 		 *
 576 | 		 * Size > FAST_PATH_THRESHOLD (256).
 577 | 		 * ip is the line_sized aligned source address for preloads.
 578 | 		 */
 579 | 
 580 | .if \write_align >= 16
 581 | 		ands	r3, r0, #(\write_align - 1)
 582 | 		push	{r4}
 583 | 		rsb	r3, r3, #\write_align
 584 | 		beq	17f
 585 | 		push	{lr}
 586 | 		bl	20f
 587 | 		pop	{lr}
 588 | 17:
 589 | .elseif \write_align == 8
 590 | 		/*
 591 | 		 * For write alignment of 8, it is quickest to do a simple
 592 | 		 * conditional load/store.
 593 | 		 */
 594 | 		tst	r0, #4
 595 | 		push	{r4}
 596 | 		ldrne	r3, [r1], #4
 597 | 		subne	r2, r2, #4
 598 | 		strne	r3, [r0], #4
 599 | .else
 600 | 		push	{r4}
 601 | .endif
 602 | 
 603 | 18:
 604 | .if (FAST_PATH_THRESHOLD - (\write_align - 4)) < \line_size
 605 | 		cmp	r2, #\line_size
 606 | 		blt	15f
 607 | .endif
 608 | 		subs	r2, r2, #\line_size
 609 | 
 610 | 16:		/*
 611 | 		 * This is the entry-point when source and destination were
 612 | 		 * initially unaligned but are now aligned because they had
 613 | 		 * the same alignment within a word. Write alignment and
 614 | 		 * size check has already been handled.
 615 | 		 */
 616 | 
 617 | 		push	{r5-r11}
 618 | 
 619 |                 /*
 620 |                  * Assume a preload at aligned base + line_size will
 621 |                  * be useful.
 622 |                  */
 623 | 		mov	r4, ip
 624 | 		pld     [ip, #\line_size]
 625 |                 add     r5, r1, #(\prefetch_distance * \line_size)
 626 |                 subs    r2, r2, #(\prefetch_distance * \line_size)
 627 |                 bic     r3, r5, #(\line_size - 1)
 628 |                 add     r4, r4, #(2 * \line_size)
 629 |                 blt     14f
 630 |                 cmp     r4, r3
 631 |                 sub     ip, r3, r1
 632 |                 /*
 633 |                  * "Catch-up" the early preloads (which have been performed up
 634 |                  * to aligned source address + line_size) to the preload offset
 635 |                  * used in the main loop.
 636 |                  */
 637 |                 bge     12f
 638 | 11:             adds    r4, r4, #\line_size		/* Thumb16 */
 639 |                 cmp     r4, r3
 640 |                 pld     [r4, #(- \line_size)]
 641 |                 blt     11b
 642 | 12:
 643 | 
 644 | 		/*
 645 | 		 * The main loop for large sizes. Copy 32 bytes at a time
 646 | 		 * using ldmia/stmia while prefetching a 32-byte aligned
 647 | 		 * address for line size 32, or 64 bytes at a time while
 648 | 		 * prefetching a 64-byte aligned address for line size 64.
 649 | 		 */
 650 | 13:		pld     [r1, ip]
 651 | 14:
 652 | .if \line_size == 32
 653 | 		ldmia   r1!, {r4-r7}
 654 | 		subs    r2, r2, #32
 655 | 		ldmia   r1!, {r8-r11}
 656 | 		stmia   r0!, {r4-r7}
 657 | 		stmia   r0!, {r8-r11}
 658 | .else
 659 | 		ldmia   r1!, {r4-r11}
 660 | 		subs    r2, r2, #64
 661 | 		stmia   r0!, {r4-r11}
 662 | 		ldmia   r1!, {r4-r11}
 663 | 		stmia   r0!, {r4-r11}
 664 | .endif
 665 | 		bge	13b
 666 | 		cmn	r2, #(\prefetch_distance * \line_size)
 667 | 		bge	14b
 668 | 		/* Correct the count. */
 669 | 		adds	r2, r2, #((\prefetch_distance + 1) * \line_size)
 670 | 		pop	{r5-r11}
 671 | 
 672 | 15:		ands	r3, r2, #60
 673 | .if \write_align <= 8
 674 | 		/*
 675 | 		 * When the subroutine is not used for write alignment, the
 676 | 		 * subroutine will only be called once, so branch without
 677 | 		 * linking.
 678 | 		 */
 679 | 		bne	20f
 680 | 19:
 681 | .else
 682 | 		mov	ip, lr
 683 | 		blne	20f
 684 | 		mov	lr, ip
 685 | .endif
 686 | 		pop	{r4}
 687 | #if FAST_PATH_THRESHOLD > 0
 688 | 		cmp	r2, #0
 689 | 		bne	3b
 690 | #else
 691 | 	ARM(	cmp	r2, #0	)
 692 | 	ARM(	beq	4f	)
 693 | 	THUMB(	cbz	r2, 4f	)
 694 | 	        /* Handle the last up to three bytes. */
 695 | 3:		movs    r2, r2, lsl #31
 696 |                 ldrhcs  r3, [r1], #2
 697 |                 strhcs  r3, [r0], #2
 698 |                 ldrbne  r3, [r1], #1
 699 |                 strbne  r3, [r0], #1
 700 | 4:
 701 | #endif
 702 | 		pop	{r0}
 703 | 		bx	lr
 704 | 
 705 |                 /*
 706 |                  * Subroutine that copies a multiple of 4 bytes of size
 707 |                  * r3 from 0 to 64 or 32 bytes. r2 is decremented by the
 708 | 		 * number of bytes copied.
 709 |                  */
 710 | 20:		tst     r3, #4
 711 |                 sub     r2, r2, r3
 712 |                 ldrne   r4, [r1], #4
 713 |                 subne   r3, r3, #4
 714 |                 strne   r4, [r0], #4
 715 | .if \write_align <= 32 && \line_size == 32
 716 |                 rsb     r3, r3, #32
 717 | .else
 718 |                 rsb     r3, r3, #64
 719 | .endif
 720 | 		/*
 721 | 		 * These ldmia/stmia instructions are 16-bit on Thumb2,
 722 | 		 * 32-bit on ARM.
 723 | 		 */
 724 | 	THUMB(	lsrs    r3, r3, #1	)
 725 |                 add     pc, pc, r3
 726 |                 nop
 727 |                 ldmia   r1!, {r3, r4}
 728 |                 stmia   r0!, {r3, r4}
 729 |                 ldmia   r1!, {r3, r4}
 730 |                 stmia   r0!, {r3, r4}
 731 |                 ldmia   r1!, {r3, r4}
 732 |                 stmia   r0!, {r3, r4}
 733 |                 ldmia   r1!, {r3, r4}
 734 |                 stmia   r0!, {r3, r4}
 735 | .if \write_align > 32 || \line_size > 32
 736 |                 ldmia   r1!, {r3, r4}
 737 |                 stmia   r0!, {r3, r4}
 738 |                 ldmia   r1!, {r3, r4}
 739 |                 stmia   r0!, {r3, r4}
 740 |                 ldmia   r1!, {r3, r4}
 741 |                 stmia   r0!, {r3, r4}
 742 |                 ldmia   r1!, {r3, r4}
 743 |                 stmia   r0!, {r3, r4}
 744 | .endif
 745 | .if \write_align <= 8
 746 | 		b	19b
 747 | .else
 748 | 		mov	pc, lr
 749 | .endif
 750 | 
 751 | 30:		/*
 752 | 		 * Unaligned case. Align the destination.
 753 | 		 * Number of bytes is > UNALIGNED_SMALL_SIZE_THRESHOLD.
 754 | 		 * Note: This may use unaligned access.
 755 | 		 * ip is the line_size aligned source address for preloads.
 756 | 		 */
 757 | 		ands	r3, r0, #3
 758 | 		push	{r4}
 759 | 		andeq	r3, r1, #3
 760 | 		beq	40f	/* Destination is aligned but source is not. */
 761 | 		/* Align the destination. */
 762 | 		cmp	r3, #2
 763 | .if \aligned_access == 1
 764 |                 ldrble  r4, [r1], #1
 765 | 		ldrble	r3, [r1], #1
 766 | 		suble	r2, r2, #2
 767 | 		strble	r4, [r0], #1
 768 | 		strble	r3, [r0], #1
 769 | .else
 770 |                 ldrhle  r4, [r1], #2
 771 | 		suble	r2, r2, #2
 772 | 		strhle	r4, [r0], #2
 773 | .endif
 774 | 		ldrbne	r4, [r1], #1
 775 | 		subne	r2, r2, #1
 776 | 		strbne	r4, [r0], #1
 777 | 		ands	r3, r1, #3
 778 | 		bne	40f	/* Destination is aligned but source is not. */
 779 | 
 780 | #if 0 && FAST_PATH_THRESHOLD > 0
 781 | 		/*
 782 | 		 * Source and destination are now aligned.
 783 | 		 * Now recreate the situation of a word-aligned memcpy
 784 | 		 * with the current source and destination,
 785 | 		 * which may require an extra preload instruction.
 786 | 		 *
 787 | 		 * This path is currently disabled disabled in favour
 788 | 		 * of the one below this which does write alignment and
 789 | 		 * jumps into the main loop for larger sizes.
 790 | 		 */
 791 | 		bic	r3, r1, #(\line_size - 1)
 792 | 		pop	{r4}
 793 | 		cmp	r3, ip
 794 | 	THUMB(	pldne	[r3]				)
 795 | 	THUMB(	cmp	r2, #FAST_PATH_THRESHOLD	)
 796 | 	THUMB(	mov	ip, r3				)
 797 | 	ARM(	beq	31f				)
 798 | 	ARM(	pld	[r3]				)
 799 | 	ARM(	mov	ip, r3				)
 800 | 31:	ARM(	cmp	r2, #FAST_PATH_THRESHOLD	)
 801 | 		bgt	10b
 802 | 
 803 | 		/*
 804 | 		 * Recreate the fast path small size check here,
 805 | 		 * but only if it necessary.
 806 | 		 */
 807 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) <= SMALL_SIZE_THRESHOLD ||
 808 | \aligned_access == 1
 809 | 		cmp	r2, #SMALL_SIZE_THRESHOLD
 810 |                 pld     [ip, #\line_size]
 811 | 		/* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
 812 | 		ble	5b
 813 | .else
 814 | 		pld	[ip, #\line_size]
 815 | .endif
 816 | 		bic	r3, r2, #15
 817 | 		b	9b
 818 | 
 819 | #else
 820 | 		/*
 821 | 		 * Source and destination are now aligned. Check carefully
 822 | 		 * whether there are enough bytes to do alignment.
 823 | 		 */
 824 | .if \write_align > 0
 825 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < (\write_align - 4) \
 826 | || \aligned_access == 1
 827 | 		cmp	r2, #(\write_align - 4)
 828 | 		blt	31f
 829 | .endif
 830 | .if \write_align == 8
 831 | 		/*
 832 | 		 * For write alignment of 8, it is quickest to do a simple
 833 | 		 * conditional load/store.
 834 | 		 */
 835 | 		tst	r0, #4
 836 | 		ldrne	r3, [r1], #4
 837 | 		subne	r2, r2, #4
 838 | 		strne	r3, [r0], #4
 839 | .else
 840 | 		ands	r3, r0, #(\write_align - 1)
 841 | 		rsb	r3, r3, #\write_align
 842 | 		beq	31f
 843 | 		push	{lr}
 844 | 		bl	20b
 845 | 		pop	{lr}
 846 | .endif
 847 | 
 848 | 31:		/*
 849 | 		 * Check whether there are enough bytes to do one iteration
 850 | 		 * of the main loop.
 851 | 		 */
 852 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3 - (\write_align - 4)) < \line_size \
 853 | || \aligned_access == 1
 854 | 		cmp	r2, #\line_size
 855 | 		blt	15b
 856 | .endif
 857 | 		subs	r2, r2, #\line_size
 858 | .else
 859 | 		/*
 860 | 		 * No write alignment. Only have to check for enough bytes to
 861 | 		 * do one iteration of the main loop.
 862 | 		 */
 863 | 
 864 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < \line_size \
 865 | || \aligned_access == 1
 866 | 		cmp	r2, #\line_size
 867 | 		blt	15b
 868 | .endif
 869 | 		subs	r2, r2, #\line_size
 870 | .endif
 871 | 		b	16b
 872 | #endif
 873 | 
 874 | 40:		/*
 875 | 		 * Unaligned case. Size is > SMALL_SIZE_THRESHOLD - 3.
 876 | 		 */
 877 | 		bic	r1, r1, #3
 878 | 		cmp	r3, #2
 879 | 		ldr	r3, [r1], #4
 880 | 		beq	41f
 881 | 		bgt	42f
 882 | 
 883 | 		unaligned_copy 8, \line_size, \prefetch_distance, \
 884 | 			\write_align, \aligned_access
 885 | 
 886 | 41:		unaligned_copy 16, \line_size, \prefetch_distance, \
 887 | 			\write_align, \aligned_access
 888 | 
 889 | 42:		unaligned_copy 24, \line_size, \prefetch_distance, \
 890 | 			\write_align, \aligned_access
 891 | 
 892 | .endm
 893 | 
 894 | /*
 895 |  * The following is a NEON-based memcpy implementation that may use unaligned
 896 |  * access, but NEON instruction addresses are always at least element aligned.
 897 |  * It is optimized for both Thumb2 (CONFIG_THUMB) and ARM mode.
 898 |  *
 899 |  * - line_size is the cache line size used for prefetches. Must be 64 or 32.
 900 |  * - prefetch_distance is the number of cache lines to look ahead and must be
 901 |  *   >= 2, or 0 to disable prefetching in the main copying loop.
 902 |  * - early_prefetch indicates whether to perform early preloads. Must be 0 or 1.
 903 |  *   When prefetch_distance > 0, early_prefetch should be 1. To remove all PLD
 904 |  *   instructions altogether, set both prefetch_distance and early_prefetch
 905 |  *   to 0.
 906 |  */
 907 | 
 908 | .macro neon_memcpy_variant line_size, prefetch_distance, early_prefetch
 909 | 
 910 | 		cmp	r2, #3
 911 | .if \prefetch_distance > 0 || \early_prefetch == 1
 912 | 		push	{r0}
 913 | .else
 914 | 		mov	ip, r0
 915 | .endif
 916 | 		orr	r3, r0, r1
 917 | 		ble	8f
 918 | .if \prefetch_distance > 0 || \early_prefetch == 1
 919 | 		bic	ip, r1, #(\line_size - 1)
 920 | .endif
 921 | 		tst	r3, #3
 922 | .if \early_prefetch == 1
 923 | 		pld	[ip]
 924 | .endif
 925 | 		bne	10f		/* Unaligned source or destination. */
 926 | 		push	{r4}
 927 | 
 928 | 		/* Aligned source and destination. */
 929 | 1:		cmp	r2, #256
 930 | 		/*
 931 | 		 * Jump to word-aligned NEON fast path <= 256 bytes.
 932 |                  */
 933 | 		ble	18f
 934 | 		subs	r2, r2, #\line_size
 935 | 
 936 |                 /* Align to a 32-byte boundary. */
 937 | #ifdef CONFIG_THUMB
 938 | 		/*
 939 | 		 * Use conditional NEON instructions when
 940 | 		 * available (Thumb2 mode)
 941 | 		 */
 942 |                 ands    r4, r0, #31
 943 |                 rsb     r4, r4, #32
 944 |                 beq     31f
 945 |                 tst     r4, #4
 946 |                 sub     r2, r2, r4
 947 |                 ldrne   r3, [r1 :32], #4
 948 |                 strne   r3, [r0 :32], #4
 949 |                 tst     r4, #8
 950 | 		vld1ne.32 {d0}, [r1]!
 951 | 		vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
 952 |                 cmp     r4, #16
 953 | 		vld1ge.32 {d2, d3}, [r1]!
 954 | 		vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
 955 | #else
 956 | 		/*
 957 | 		 * Otherwise, branch into a series of single
 958 | 		 * loads/stores.
 959 | 	 	 */
 960 |                 ands    r4, r0, #31
 961 |                 beq     31f
 962 | 		rsb	r3, r4, #32
 963 | 		lsl	r4, r4, #1
 964 | 		sub	r2, r2, r3
 965 | 		add	pc, pc, r4
 966 | 		nop
 967 | 		ldr	r3, [r1], #4
 968 | 		str	r3, [r0], #4
 969 | 		ldr	r4, [r1], #4
 970 | 		str	r4, [r0], #4
 971 | 		ldr	r3, [r1], #4
 972 | 		str	r3, [r0], #4
 973 | 		ldr	r4, [r1], #4
 974 | 		str	r4, [r0], #4
 975 | 		ldr	r3, [r1], #4
 976 | 		str	r3, [r0], #4
 977 | 		ldr	r4, [r1], #4
 978 | 		str	r4, [r0], #4
 979 | 		ldr	r3, [r1], #4
 980 | 		str	r3, [r0], #4
 981 | 		ldr	r4, [r1], #4
 982 | 		str	r4, [r0], #4
 983 | #endif
 984 |                 cmp     r2, #0
 985 | 		addlt	r2, r2, \line_size
 986 |                 blt     6f
 987 | 
 988 | 31:
 989 | .if \early_prefetch == 1
 990 | 		pld     [ip, #\line_size]
 991 | .endif
 992 | .if \prefetch_distance > 0
 993 |                 /*
 994 |                  * Assume a preload at aligned base + line_size will
 995 |                  * be useful.
 996 |                  */
 997 | 		push	{r5}
 998 | 		mov	r4, ip
 999 |                 add     r5, r1, #(\prefetch_distance * \line_size)
1000 |                 subs    r2, r2, #(\prefetch_distance * \line_size)
1001 |                 bic     r3, r5, #(\line_size - 1)
1002 |                 add     r4, r4, #(2 * \line_size)
1003 |                 blt     5f
1004 |                 cmp     r4, r3
1005 |                 sub     ip, r3, r1
1006 |                 /*
1007 |                  * "Catch-up" the early preloads (which have been performed up
1008 |                  * to aligned source address + line_size) to the preload offset
1009 |                  * used in the main loop.
1010 |                  */
1011 |                 bge     3f
1012 | 2:              adds    r4, r4, #\line_size		/* Thumb16 */
1013 |                 cmp     r4, r3
1014 |                 pld     [r4, #(- \line_size)]
1015 |                 blt     2b
1016 | 3:
1017 | .endif
1018 | 
1019 | 		sub	ip, ip, #\line_size
1020 | 4:
1021 |                 /*
1022 |                  * Since the destination is 32-byte aligned,
1023 |                  * specify 256-bit alignment for the NEON stores.
1024 |                  */
1025 | .if \line_size == 32
1026 | 		vld1.32 {d0-d3}, [r1]!
1027 |                 subs    r2, r2, #32
1028 | .if \prefetch_distance > 0
1029 | 		pld	[r1, ip]
1030 | .endif
1031 |                 vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1032 | .else	/* line_size == 64 */
1033 | 		vld1.32 {d0-d3}, [r1]!
1034 |                 vld1.32 {d4-d7}, [r1]!
1035 | .if \prefetch_distance > 0
1036 | 		pld	[r1, ip]
1037 | .endif
1038 |                 vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1039 |                 subs    r2, r2, #64
1040 |                 vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
1041 | .endif
1042 |                 bge     4b
1043 | .if \prefetch_distance > 0
1044 | 5:
1045 | .if \line_size == 32
1046 | 		vld1.32 {d0-d3}, [r1]!
1047 |                 subs    r2, r2, #32
1048 |                 vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1049 | .else	/* line_size == 64 */
1050 | 		vld1.32 {d0-d3}, [r1]!
1051 |                 vld1.32 {d4-d7}, [r1]!
1052 |                 vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1053 |                 subs    r2, r2, #64
1054 |                 vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
1055 | .endif
1056 |         	cmn     r2, #(\prefetch_distance * \line_size)
1057 |         	bge     5b
1058 | .endif
1059 |                 /* Correct the count. */
1060 | 23:     	adds    r2, r2, #((\prefetch_distance + 1) * \line_size)
1061 | .if \prefetch_distance > 0
1062 | 		pop	{r5}
1063 | .endif
1064 | 
1065 |                 /*
1066 | 		 * Process the last 0-(line_size - 1) bytes, destination
1067 | 		 * 32-byte aligned, source word aligned.
1068 | 		 */
1069 | 6:
1070 | #ifdef CONFIG_THUMB
1071 | 		/*
1072 | 		 * Use conditional NEON instructions when
1073 | 		 * available (Thumb2 mode).
1074 | 		 */
1075 | .if \line_size == 64
1076 | 		cmp     r2, #32
1077 |                 vld1ge.32 {d0-d3}, [r1]!
1078 |                 vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]!
1079 |                 tst     r2, #16
1080 |                 vld1ne.32 {d0, d1}, [r1]!
1081 |                 vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
1082 | .else
1083 |                 cmp     r2, #16
1084 |                 vld1ge.32 {d0, d1}, [r1]!
1085 |                 vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
1086 | .endif
1087 |                 tst     r2, #8
1088 |                 vld1ne.32 {d2}, [r1]!
1089 |                 vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]!
1090 |                 tst     r2, #4
1091 |                 ldrne   r3, [r1], #4
1092 |                 strne   r3, [r0 :32], #4
1093 | 
1094 |                 pop     {r4}
1095 | #else
1096 | 		/*
1097 | 		 * Just use the world-aligned tail code if we
1098 |                  * don't have Thumb2.
1099 | 		 */
1100 | 		b	17f
1101 | #endif
1102 | 
1103 | 	        /*
1104 | 		 * Handle the last up to three bytes. Unaligned access
1105 | 		 * may take place if source or destination is not
1106 | 		 * half-word aligned.
1107 | 		 */
1108 | 8:		movs    r2, r2, lsl #31
1109 |                 ldrhcs  r3, [r1], #2
1110 |                 strhcs  r3, [r0], #2
1111 |                 ldrbne  r3, [r1], #1
1112 |                 strbne  r3, [r0]
1113 | 9:
1114 | .if \prefetch_distance > 0 || \early_prefetch == 1
1115 | 		pop	{r0}
1116 | .else
1117 | 		mov	r0, ip
1118 | .endif
1119 | 		bx	lr
1120 | 
1121 | 10:             /*
1122 |                  * Unaligned case. Align the destination.
1123 |                  * Number of bytes is > 3.
1124 |                  * Note: This may use unaligned access.
1125 |                  * ip is the line_size aligned source address for preloads.
1126 |                  */
1127 | 		cmp	r2, #64
1128 |                 push    {r4}
1129 | 		/* For small sizes < 64 bytes just use the unaligned tail code. */
1130 | 		blt	16f
1131 | 		ands	r3, r0, #3
1132 |                 beq     11f     /* Destination is aligned but source is not. */
1133 |                 /* Align the destination. */
1134 | 		cmp	r3, #2
1135 |                 ldrbne  r4, [r1], #1
1136 |                 subne   r2, r2, #1
1137 |                 strbne  r4, [r0], #1
1138 |                 ldrhle  r4, [r1], #2
1139 |                 suble   r2, r2, #2
1140 |                 strhle  r4, [r0], #2
1141 |                 tst	r1, #3
1142 |                 beq     1b      /* Destination and source are now aligned. */
1143 | 		/* Destination is now aligned to a word boundary. */
1144 | 11:
1145 | 		cmp	r2, #64
1146 | 		/*
1147 | 		 * Jump to non-aligned NEON tail code for <= 64 bytes.
1148 |                  */
1149 | 		ble	16f
1150 | 		subs	r2, r2, #\line_size
1151 | 
1152 |                 /* Align destination to a 32-byte boundary. */
1153 |                 ands    r4, r0, #31
1154 |                 rsb     r4, r4, #32
1155 |                 beq     20f
1156 |                 tst     r4, #4
1157 |                 sub     r2, r2, r4
1158 |                 ldrne   r3, [r1 :8], #4         /* Unaligned access. */
1159 |                 strne   r3, [r0 :32], #4
1160 |                 tst     r4, #8
1161 | #ifdef CONFIG_THUMB
1162 | 		/*
1163 | 		 * Use conditional NEON instructions when
1164 | 		 * available (Thumb2 mode)
1165 | 		 */
1166 |                 vld1ne.8 {d0}, [r1]!
1167 |                 vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
1168 |                 cmp     r4, #16
1169 |                 vld1ge.8 {d2, d3}, [r1]!
1170 |                 vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
1171 | #else
1172 | 		beq	31f
1173 |                 vld1.8 {d0}, [r1]!
1174 |                 vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
1175 | 31:             cmp     r4, #16
1176 | 		blt	32f
1177 |                 vld1.8 {d2, d3}, [r1]!
1178 |                 vst1.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
1179 | 32:
1180 | #endif
1181 |                 cmp     r2, #0
1182 |                 addlt   r2, r2, #\line_size
1183 |                 blt     16f
1184 | 20:
1185 | 
1186 | .if \early_prefetch == 1
1187 | 		pld     [ip, #\line_size]
1188 | .endif
1189 | .if \prefetch_distance > 0
1190 |                 /*
1191 |                  * Assume a preload at aligned base + line_size will
1192 |                  * be useful.
1193 |                  */
1194 | 		push	{r5}
1195 | 		mov	r4, ip
1196 |                 add     r5, r1, #(\prefetch_distance * \line_size)
1197 |                 subs    r2, r2, #(\prefetch_distance * \line_size)
1198 |                 bic     r3, r5, #(\line_size - 1)
1199 |                 add     r4, r4, #(2 * \line_size)
1200 |                 blt     15f
1201 |                 cmp     r4, r3
1202 |                 sub     ip, r3, r1
1203 |                 /*
1204 |                  * "Catch-up" the early preloads (which have been performed up
1205 |                  * to aligned source address + line_size) to the preload offset
1206 |                  * used in the main loop.
1207 |                  */
1208 |                 bge     13f
1209 | 12:             adds    r4, r4, #\line_size		/* Thumb16 */
1210 |                 cmp     r4, r3
1211 |                 pld     [r4, #(- \line_size)]
1212 |                 blt     12b
1213 | .endif
1214 | 
1215 | 13:
1216 |                 /*
1217 |                  * Process 64 unaligned bytes from source at a time and copy
1218 |                  * them to the 32-byte aligned destination.
1219 |                  */
1220 | 14:
1221 | .if \prefetch_distance > 0
1222 | 		pld     [r1, ip]
1223 | .endif
1224 | 15:
1225 | .if \line_size == 32
1226 | 		vld1.8  {d0-d3}, [r1]!
1227 |                 subs    r2, r2, #32
1228 |                 vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1229 | .else	/* line_size == 64 */
1230 | 		vld1.8  {d0-d3}, [r1]!
1231 |                 vld1.8  {d4-d7}, [r1]!
1232 |                 vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1233 |                 subs    r2, r2, #64
1234 |                 vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
1235 | .endif
1236 |                 bge     14b
1237 | .if \prefetch_distance > 0
1238 | 		cmn     r2, #(\prefetch_distance * \line_size)
1239 |         	bge     15b
1240 | .endif
1241 |                 /* Correct the count. */
1242 |                 adds    r2, r2, #((\prefetch_distance + 1) * \line_size)
1243 | .if \prefetch_distance > 0
1244 |         	pop     {r5}
1245 | .endif
1246 | 
1247 | 		/*
1248 | 		 * Handle last 0-(line_size - 1) bytes (destination 32-byte
1249 | 		 * aligned source unaligned).
1250 | 		 */
1251 | #ifdef CONFIG_THUMB
1252 | 		/*
1253 | 		 * Use conditional NEON instructions when
1254 | 		 * available (Thumb2 mode)
1255 | 		 */
1256 | .if \line_size == 64
1257 | 		cmp     r2, #32
1258 |                 vld1ge.8 {d0-d3}, [r1]!
1259 |                 vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]!
1260 |                 tst     r2, #16
1261 |                 vld1ne.8 {d0, d1}, [r1]!
1262 |                 vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
1263 | .else
1264 |                 cmp     r2, #16
1265 |                 vld1ge.8 {d0, d1}, [r1]!
1266 |                 vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
1267 | .endif
1268 |                 tst     r2, #8
1269 |                 vld1ne.8 {d2}, [r1]!
1270 |                 vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]!
1271 |                 tst     r2, #4
1272 |                 ldrne   r3, [r1], #4
1273 |                 strne   r3, [r0 :32], #4
1274 | 
1275 |                 pop     {r4}
1276 | 		b	8b
1277 | #else
1278 | 		/*
1279 | 		 * Fall through to the code below. It is not entirely
1280 | 		 * optimal because it does not indicate the destination
1281 | 		 * is word aligned.
1282 | 		 */
1283 | #endif
1284 | 
1285 |                 /* Handle small size of 0-63 bytes, unaligned. */
1286 | 16:		bic	r3, r2, #7
1287 | 		rsb	r4, r3, #64
1288 | 		tst	r2, #7
1289 | 		add	pc, pc, r4
1290 | 		nop
1291 | 		vld1.8 {d0}, [r1]!
1292 | 		vst1.8 {d0}, [r0]!
1293 | 		vld1.8 {d1}, [r1]!
1294 | 		vst1.8 {d1}, [r0]!
1295 | 		vld1.8 {d0}, [r1]!
1296 | 		vst1.8 {d0}, [r0]!
1297 | 		vld1.8 {d1}, [r1]!
1298 | 		vst1.8 {d1}, [r0]!
1299 | 		vld1.8 {d0}, [r1]!
1300 | 		vst1.8 {d0}, [r0]!
1301 | 		vld1.8 {d1}, [r1]!
1302 | 		vst1.8 {d1}, [r0]!
1303 | 		vld1.8 {d0}, [r1]!
1304 | 		vst1.8 {d0}, [r0]!
1305 | 		vld1.8 {d1}, [r1]!
1306 | 		vst1.8 {d1}, [r0]!
1307 |                 pop     {r4}
1308 | 		beq	9b
1309 |                 tst     r2, #4
1310 |                 ldrne   r3, [r1 :8], #4         /* Unaligned access. */
1311 |                 strne   r3, [r0], #4
1312 | 		b	8b
1313 | 
1314 |                 /* Handle small size of 0-63 bytes, word aligned. */
1315 | 17:
1316 | #ifdef CONFIG_THUMB
1317 | 		cmp     r2, #32
1318 |                 vld1ge.32 {d0-d3}, [r1]!
1319 |                 vst1ge.32 {d0-d3}, [r0]!
1320 |                 tst     r2, #16
1321 |                 vld1ne.32 {d0, d1}, [r1]!
1322 |                 vst1ne.32 {d0, d1}, [r0]!
1323 |                 tst     r2, #8
1324 |                 vld1ne.32 {d2}, [r1]!
1325 |                 vst1ne.32 {d2}, [r0]!
1326 | 		tst	r2, #7
1327 | #else
1328 | 		bic	r3, r2, #7
1329 | 		rsb	r4, r3, #64
1330 | 		tst	r2, #7
1331 | 		add	pc, pc, r4
1332 | 		nop
1333 | 		vld1.32 {d0}, [r1]!
1334 | 		vst1.32 {d0}, [r0]!
1335 | 		vld1.32 {d1}, [r1]!
1336 | 		vst1.32 {d1}, [r0]!
1337 | 		vld1.32 {d0}, [r1]!
1338 | 		vst1.32 {d0}, [r0]!
1339 | 		vld1.32 {d1}, [r1]!
1340 | 		vst1.32 {d1}, [r0]!
1341 | 		vld1.32 {d0}, [r1]!
1342 | 		vst1.32 {d0}, [r0]!
1343 | 		vld1.32 {d1}, [r1]!
1344 | 		vst1.32 {d1}, [r0]!
1345 | 		vld1.32 {d0}, [r1]!
1346 | 		vst1.32 {d0}, [r0]!
1347 | 		vld1.32 {d1}, [r1]!
1348 | 		vst1.32 {d1}, [r0]!
1349 | #endif
1350 |                 pop     {r4}
1351 | 		beq	9b
1352 | 		tst	r2, #4
1353 | 		ldrne	r3, [r1], #4
1354 | 		strne	r3, [r0], #4
1355 | 		b	8b
1356 | 
1357 | 		/*
1358 | 		 * Fast path for <= 256 bytes, word aligned.
1359 | 		 * This is hardcoded for a preload offset of 128 bytes,
1360 | 		 * which seems to work well in practice for small sizes.
1361 | 		 */
1362 | 18:		bics	r3, r2, #31
1363 | .if \early_prefetch == 1
1364 | 		pld	[ip, #32]
1365 | 		beq	21f
1366 | 		pld	[ip, #64]
1367 | 		pld	[ip, #96]
1368 | .endif
1369 | 		rsb	r4, r3, #256
1370 | 		ands	r2, r2, #31
1371 | 		/*
1372 | 		 * Each code block handling 32 bytes is
1373 |                  * 12 bytes long.
1374 | 		 */
1375 | 		lsr	r4, r4, #2
1376 |                 add     ip, ip, #128
1377 | 		add	r4, r4, r4, lsr #1
1378 | 		sub	ip, ip, r1
1379 | 		add	pc, pc, r4
1380 | 		nop
1381 | 		pld	[r1, ip]
1382 | 		vld1.32 {d0-d3}, [r1]!
1383 | 		vst1.32 {d0-d3}, [r0]!
1384 | 		pld	[r1, ip]
1385 | 		vld1.32 {d4-d7}, [r1]!
1386 | 		vst1.32 {d4-d7}, [r0]!
1387 | 		pld	[r1, ip]
1388 | 		vld1.32 {d0-d3}, [r1]!
1389 | 		vst1.32 {d0-d3}, [r0]!
1390 | 		pld	[r1, ip]
1391 | 		vld1.32 {d4-d7}, [r1]!
1392 | 		vst1.32 {d4-d7}, [r0]!
1393 | 		pld	[r1, ip]
1394 | 		vld1.32 {d0-d3}, [r1]!
1395 | 		vst1.32 {d0-d3}, [r0]!
1396 | 		W(nop)
1397 | 		vld1.32 {d4-d7}, [r1]!
1398 | 		vst1.32 {d4-d7}, [r0]!
1399 | 		W(nop)
1400 | 		vld1.32 {d0-d3}, [r1]!
1401 | 		vst1.32 {d0-d3}, [r0]!
1402 | 		W(nop)
1403 | 		vld1.32 {d4-d7}, [r1]!
1404 | 		vst1.32 {d4-d7}, [r0]!
1405 | 		beq	19f
1406 | 21:
1407 | #ifdef CONFIG_THUMB
1408 | 		cmp	r2, #16
1409 | 		vld1ge.32 {d0-d1}, [r1]!
1410 | 		vst1ge.32 {d0-d1}, [r0]!
1411 | 		tst	r2, #8
1412 | 		vld1ne.32 {d0}, [r1]!
1413 | 		vst1ne.32 {d0}, [r0]!
1414 | #else
1415 | 		cmp	r2, #16
1416 | 		ldmiage r1!, {r3, r4}
1417 | 		stmiage r0!, {r3, r4}
1418 | 		ldmiage r1!, {r3, r4}
1419 | 		stmiage r0!, {r3, r4}
1420 | 		tst	r2, #8
1421 | 		ldmiane r1!, {r3, r4}
1422 | 		stmiane r0!, {r3, r4}
1423 | #endif
1424 |                 tst     r2, #4
1425 | 		pop	{r4}
1426 |                 ldrne   r3, [r1], #4
1427 |                 strne   r3, [r0 :32], #4
1428 | 		and	r2, r2, #3
1429 | 		b	8b
1430 | 19:
1431 | 		pop	{r4}
1432 | .if \prefetch_distance > 0 || \early_prefetch == 1
1433 | 		pop	{r0}
1434 | .else
1435 | 		mov	r0, ip
1436 | .endif
1437 | 		bx	lr
1438 | .endm
1439 | 
1440 | 
1441 | #if defined(MEMCPY_REPLACEMENT_RPI) || defined(MEMCPY_REPLACEMENT_ARMV7_32) \
1442 | || defined(MEMCPY_REPLACEMENT_ARMV7_64) || defined(MEMCPY_REPLACEMENT_NEON_32) \
1443 | || defined(MEMCPY_REPLACEMENT_NEON_64)
1444 | 
1445 | #ifdef MEMCPY_REPLACEMENT_RPI
1446 | asm_function memcpy
1447 | 		memcpy_variant 32, 3, 8, 0
1448 | .endfunc
1449 | #endif
1450 | 
1451 | #ifdef MEMCPY_REPLACEMENT_ARMV7_32
1452 | asm_function memcpy
1453 | 		memcpy_variant 32, 6, 0, 0
1454 | .endfunc
1455 | #endif
1456 | 
1457 | #ifdef MEMCPY_REPLACEMENT_ARMV7_64
1458 | asm_function memcpy
1459 | 		memcpy_variant 64, 3, 0, 0
1460 | .endfunc
1461 | #endif
1462 | 
1463 | #ifdef MEMCPY_REPLACEMENT_NEON_32
1464 | asm_function memcpy
1465 | 		neon_memcpy_variant 32, 6, 1
1466 | .endfunc
1467 | #endif
1468 | 
1469 | #ifdef MEMCPY_REPLACEMENT_NEON_64
1470 | asm_function memcpy
1471 | 		neon_memcpy_variant 64, 3, 1
1472 | .endfunc
1473 | #endif
1474 | 
1475 | #ifdef MEMCPY_REPLACEMENT_NEON_AUTO
1476 | asm_function memcpy
1477 | 		neon_memcpy_variant 32, 0, 1
1478 | .endfunc
1479 | #endif
1480 | 
1481 | #else
1482 | 
1483 | asm_function memcpy_new_line_size_64_preload_192
1484 | 		memcpy_variant 64, 3, 0, 0
1485 | .endfunc
1486 | 
1487 | asm_function memcpy_new_line_size_64_preload_192_align_32
1488 | 		memcpy_variant 64, 3, 32, 0
1489 | .endfunc
1490 | 
1491 | asm_function memcpy_new_line_size_64_preload_192_aligned_access
1492 | 		memcpy_variant 64, 3, 0, 1
1493 | .endfunc
1494 | 
1495 | asm_function memcpy_new_line_size_32_preload_192
1496 | 		memcpy_variant 32, 6, 0, 0
1497 | .endfunc
1498 | 
1499 | asm_function memcpy_new_line_size_32_preload_192_align_32
1500 | 		memcpy_variant 32, 6, 32, 0
1501 | .endfunc
1502 | 
1503 | asm_function memcpy_new_line_size_32_preload_96
1504 | 		memcpy_variant 32, 3, 8, 0
1505 | .endfunc
1506 | 
1507 | asm_function memcpy_new_line_size_32_preload_96_aligned_access
1508 | 		memcpy_variant 32, 3, 8, 1
1509 | .endfunc
1510 | 
1511 | asm_function memcpy_new_neon_line_size_64
1512 | 		neon_memcpy_variant 64, 3, 1
1513 | .endfunc
1514 | 
1515 | asm_function memcpy_new_neon_line_size_32
1516 | 		neon_memcpy_variant 32, 6, 1
1517 | .endfunc
1518 | 
1519 | asm_function memcpy_new_neon_line_size_32_auto
1520 | 		neon_memcpy_variant 32, 0, 1
1521 | .endfunc
1522 | 
1523 | #endif
1524 | 
1525 | /*
1526 |  * Macro for memset replacement.
1527 |  * write_align must be 0, 8, or 32.
1528 |  * use_neon must be 0 or 1.
1529 |  */
1530 | 
1531 | .macro memset_variant write_align, use_neon
1532 | .if \use_neon == 1
1533 | 	.fpu neon
1534 | .endif
1535 | 	ands	r3, r0, #3
1536 | 	mov	ip, r0
1537 | 	bne	7f
1538 | 
1539 | 	/* Destination is word aligned. */
1540 | 1:	orr	r1, r1, r1, lsl #8
1541 | .if \use_neon == 1
1542 | 	cmp	r2, #16
1543 | .else
1544 | 	cmp	r2, #8
1545 | .endif
1546 | 	orr	r1, r1, r1, lsl #16
1547 | .if \use_neon == 1
1548 | 	blt	13f
1549 |         vmov	d0, r1, r1
1550 | 	vmov	d1, r1, r1
1551 | .else
1552 | 	blt	5f
1553 | 	mov	r3, r1
1554 | .endif
1555 | 
1556 | 	cmp	r2, #64
1557 | 	push 	{r4}
1558 | .if \use_neon == 1
1559 | 	blt	10f
1560 | .else
1561 | 	ble	10f
1562 | .endif
1563 | .if \write_align > 0
1564 | 	ands	r4, r0, #(\write_align - 1)
1565 | .if \use_neon == 1
1566 | #ifndef CONFIG_THUMB
1567 | 	add	r3, r4, #7
1568 | #endif
1569 | .endif
1570 | 	/* Let r4 be equal to the number of bytes to align.  */
1571 | 	rsb	r4, r4, #\write_align
1572 | 	/*
1573 | 	 * At this point r4 contains the number of bytes to align
1574 | 	 * if eq is not set. The eq flag is set if there are no bytes
1575 | 	 * to align.
1576 | 	 */
1577 | .if \write_align == 8
1578 | 	subne	r2, r2, r4
1579 | 	strne	r1, [r0], #4
1580 | .elseif \write_align == 32
1581 | 	beq	2f
1582 | 	tst     r4, #4
1583 | 	sub	r2, r2, r4
1584 | 	strne	r1, [r0], #4
1585 | .if \use_neon == 1
1586 | #ifdef CONFIG_THUMB
1587 | 	tst     r4, #8
1588 | 	vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
1589 | 	cmp	r4, #16
1590 | 	vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
1591 | #else
1592 | 	bic	r4, r3, #7
1593 | 	lsr	r4, r4, #1
1594 | 	add	pc, pc, r4
1595 | 	nop
1596 | 	vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
1597 | 	vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
1598 | 	vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
1599 | 	vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
1600 | #endif
1601 | .else
1602 | 	tst     r4, #8
1603 | 	stmiane r0!, {r1, r3}
1604 | 	cmp	r4, #16
1605 | 	stmiage r0!, {r1, r3}
1606 | 	stmiage r0!, {r1, r3}
1607 | .endif
1608 | .endif	/* \write_align == 32 */
1609 | 	cmp	r2, #64
1610 | 	blt	4f
1611 | .endif	/* \write_align > 0 */
1612 | 
1613 | 2:
1614 | .if \use_neon == 1
1615 |         /*
1616 |          * When NEON is enabled, \write_align is
1617 |          * equal to 32 so specify 256-bit alignment in the
1618 |          * NEON store instructions.
1619 |          */
1620 | 	subs	r2, r2, #64
1621 |         vmov	q1, q0
1622 | 3:	vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1623 | 	subs	r2, r2, #64
1624 |         vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
1625 |         bge     3b
1626 | 	adds	r2, r2, #64
1627 | .else
1628 | 	mov	r4, r1
1629 | 	subs	r2, r2, #64
1630 | 	push	{r5}
1631 | 	mov	r5, r1
1632 | 
1633 | 3:	stmia	r0!, {r1, r3, r4, r5}
1634 | 	subs	r2, r2, #64		/* Thumb16 */
1635 | 	stmia	r0!, {r1, r3, r4, r5}
1636 | 	stmia	r0!, {r1, r3, r4, r5}
1637 | 	stmia	r0!, {r1, r3, r4, r5}
1638 | 	bge	3b
1639 | 	adds	r2, r2, #64		/* Thumb16 */
1640 | 
1641 | 	pop	{r5}
1642 | .endif
1643 | 	/* Early exit if there are 0 bytes left. */
1644 | /* THUMB(	cbz	r2, 9f	) */
1645 | THUMB(	cmp	r2, #0	)
1646 | THUMB(	beq	9f	)
1647 | ARM(	teq	r2, #0	)
1648 | ARM(	beq	9f	)
1649 | 	/*
1650 | 	 * Handle 8-64 bytes (or 16-63 bytes in case of NEON).
1651 | 	 * In case of NEON, destination must be 8-byte aligned.
1652 | 	 */
1653 | 4:
1654 | .if \use_neon == 1
1655 | #ifdef CONFIG_THUMB
1656 | 	vmov	q1, q0
1657 | 	cmp	r2, #32
1658 | 	vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(64)]!
1659 | 	tst	r2, #16
1660 | 	vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
1661 | 	tst	r2, #8
1662 | 	vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
1663 | 	and	r2, r2, #7
1664 | #else
1665 | 	bic	r4, r2, #15
1666 | 	subs	r2, r2, r4
1667 | 	rsb	r4, r4, #64
1668 | 	/*
1669 | 	 * When using NEON, the vst instruction
1670 | 	 * (storing 16 bytes) is always 32-bit.
1671 | 	 */
1672 | 	lsr	r4, r4, #2
1673 | 	add	pc, pc, r4
1674 | 	nop
1675 | 	vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
1676 | 	vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
1677 | 	vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
1678 | 	vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
1679 | 	cmp	r2, #8
1680 | 	strge	r1, [r0], #4
1681 | 	strge	r1, [r0], #4
1682 | 	subge	r2, r2, #8
1683 | #endif
1684 | .else	/* use_neon == 0 */
1685 | 	bic	r4, r2, #7
1686 | 	subs	r2, r2, r4
1687 | 	rsb	r4, r4, #64
1688 | 	/*
1689 | 	 * The stmia instruction (storing 8 bytes) is 32-bit for ARM,
1690 | 	 * 16-bit for Thumb2.
1691 | 	 */
1692 | THUMB(	lsrs	r4, r4, #2	)
1693 | ARM(	lsr	r4, r4, #1	)
1694 | 	add	pc, pc, r4
1695 | 	nop
1696 | 	stmia	r0!, {r1, r3}
1697 | 	stmia	r0!, {r1, r3}
1698 | 	stmia	r0!, {r1, r3}
1699 | 	stmia	r0!, {r1, r3}
1700 | 	stmia	r0!, {r1, r3}
1701 | 	stmia	r0!, {r1, r3}
1702 | 	stmia	r0!, {r1, r3}
1703 | 	stmia	r0!, {r1, r3}
1704 | .endif
1705 | 14:	pop	{r4}
1706 | 
1707 | 5:	cmp	r2, #4
1708 | 	strge	r1, [r0], #4
1709 | 	/* Early exit for multiple of 4 size. */
1710 | 	ands	r2, r2, #3
1711 | 	moveq	r0, ip
1712 | 	bxeq	lr
1713 | 
1714 | 	/*
1715 | 	 * At this point there are 1, 2 or 3 bytes,
1716 | 	 * and the destination is aligned.
1717 | 	 */
1718 | 6:	cmp	r2, #2
1719 | 	strhge	r1, [r0], #2
1720 | 	strbne	r1, [r0]
1721 | 	mov	r0, ip
1722 | 	bx	lr
1723 | 
1724 | .if \use_neon == 1
1725 | 	/* 0-15 bytes left, word aligned. */
1726 | 13:	cmp	r2, #8
1727 | 	strge	r1, [r0]
1728 | 	strge	r1, [r0, #4]
1729 | 	addge	r0, r0, #8
1730 | 	subge	r2, r2, #8
1731 | 	b	5b
1732 | .endif
1733 | 
1734 | 	/* Unaligned case. */
1735 | 7:	cmp	r2, #4
1736 | 	blt	8f
1737 | #ifdef CONFIG_THUMB
1738 | .if \use_neon == 1
1739 | 	/*
1740 | 	 * When Thumb2 is enabled with NEON, use the optimized
1741 | 	 * unaligned NEON code path for small sizes.
1742 | 	 */
1743 | 	cmp 	r2, #64
1744 | 	blt	11f
1745 | .endif
1746 | #endif
1747 | 	/* Align the destination. */
1748 | 	cmp	r3, #2
1749 | 	sub	r2, r2, #4
1750 | 	strble	r1, [r0]
1751 | 	strble	r1, [r0, #1]
1752 | 	addle	r0, r0, #2
1753 | 	add	r2, r2, r3
1754 | 	strbne	r1, [r0], #1
1755 | 	b	1b
1756 | 
1757 | 	/* 0 to 3 bytes left. */
1758 | 8:	cmp	r2, #2
1759 | 	strbge  r1, [r0]
1760 | 	strbge  r1, [r0, #1]
1761 | 	addge	r0, r0, #2
1762 | 	tst	r2, #1
1763 | 	strbne  r1, [r0]
1764 | 	mov	r0, ip
1765 | 	bx	lr
1766 | 
1767 | 9:	pop	{r4}
1768 | 	mov	r0, ip
1769 | 	bx	lr
1770 | 
1771 | 	/*
1772 | 	 * Word aligned 8 <= size <= 64
1773 | 	 * (16 <= size <= 63 in case of NEON).
1774 | 	 */
1775 | 10:
1776 | 	/* Align the destination to an 8 byte boundary. */
1777 | 	tst     r0, #4
1778 | 	strne	r1, [r0], #4
1779 | 	subne	r2, r2, #4
1780 | .if \use_neon == 1
1781 | 	cmp	r2, #16
1782 | 	poplt	{r4}
1783 | 	blt	13b
1784 | .else
1785 | 	cmp	r2, #8
1786 | 	blt	14b
1787 | .endif
1788 | 	b	4b
1789 | 
1790 | #ifdef CONFIG_THUMB
1791 | .if \use_neon == 1
1792 | 	/*
1793 | 	 * Handle 4 <= size <= 63 bytes, unaligned.
1794 | 	 * Use unaligned NEON instructions with Thumb2.
1795 | 	 */
1796 | 11:
1797 | 	orr	r1, r1, r1, lsl #8
1798 | 	tst	r2, #8
1799 | 	orr	r1, r1, r1, lsl #16
1800 |         vmov	d0, r1, r1
1801 | 	vst1ne.8 {d0}, [r0]!
1802 | 	vmov	d1, r1, r1
1803 | 	tst	r2, #16
1804 | 	vst1ne.8 {d0, d1}, [r0]!
1805 | 	vmov	q1, q0
1806 | 	cmp	r2, #32
1807 | 	and	r2, r2, #7
1808 | 	vst1ge.8 {d0-d3}, [r0]!
1809 | 	cmp	r2, #4
1810 | 	/* The following store is unaligned. */
1811 | 	strge	r1, [r0], #4
1812 | 	subge	r2, r2, #4
1813 | 	b	8b
1814 | .endif
1815 | #endif
1816 | .endm
1817 | 
1818 | #if defined(MEMSET_REPLACEMENT_RPI) || defined(MEMSET_REPLACEMENT_ARMV7_32) \
1819 | || defined(MEMSET_REPLACEMENT_ARMV7_64) || defined(MEMSET_REPLACEMENT_NEON_32) \
1820 | || defined(MEMSET_REPLACEMENT_NEON_64)
1821 | 
1822 | #ifdef MEMSET_REPLACEMENT_RPI
1823 | asm_function memset
1824 | 		memset_variant 32, 0
1825 | .endfunc
1826 | #endif
1827 | 
1828 | #if defined(MEMSET_REPLACEMENT_ARMV7_32) || defined(MEMSET_REPLACEMENT_ARMV7_64)
1829 | asm_function memset
1830 | 		memset_variant 8, 0
1831 | .endfunc
1832 | #endif
1833 | 
1834 | #if defined(MEMSET_REPLACEMENT_NEON_32) || defined(MEMSET_REPLACEMENT_NEON_64)
1835 | asm_function memset
1836 | 		memset_variant 32, 1
1837 | .endfunc
1838 | #endif
1839 | 
1840 | #else
1841 | 
1842 | asm_function memset_new_align_0
1843 | 		memset_variant 0, 0
1844 | .endfunc
1845 | 
1846 | asm_function memset_new_align_8
1847 | 		memset_variant 8, 0
1848 | .endfunc
1849 | 
1850 | asm_function memset_new_align_32
1851 | 		memset_variant 32, 0
1852 | .endfunc
1853 | 
1854 | asm_function memset_neon
1855 | 		memset_variant 32, 1
1856 | .endfunc
1857 | 
1858 | #endif
1859 | 


--------------------------------------------------------------------------------
/new_arm.h:
--------------------------------------------------------------------------------
 1 | 
 2 | extern void *memcpy_new_line_size_64_preload_192(void *dest,
 3 |     const void *src, size_t n);
 4 | 
 5 | extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest,
 6 |     const void *src, size_t n);
 7 | 
 8 | extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest,
 9 |     const void *src, size_t n);
10 | 
11 | extern void *memcpy_new_line_size_32_preload_192(void *dest,
12 |     const void *src, size_t n);
13 | 
14 | extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest,
15 |     const void *src, size_t n);
16 | 
17 | extern void *memcpy_new_line_size_32_preload_96(void *dest,
18 |     const void *src, size_t n);
19 | 
20 | extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest,
21 |     const void *src, size_t n);
22 | 
23 | extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n);
24 | 
25 | extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n);
26 | 
27 | extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n);
28 | 
29 | extern void *memset_new_align_0(void *dest, int c, size_t size);
30 | 
31 | extern void *memset_new_align_8(void *dest, int c, size_t size);
32 | 
33 | extern void *memset_new_align_32(void *dest, int c, size_t size);
34 | 
35 | extern void *memset_neon(void *dest, int c, size_t size);
36 | 


--------------------------------------------------------------------------------