├── Makefile ├── README ├── README.md ├── arm_asm.S ├── arm_asm.h ├── benchmark.c ├── memcpy-hybrid.S ├── memcpy-hybrid.h ├── new_arm.S └── new_arm.h /Makefile: -------------------------------------------------------------------------------- 1 | # PLATFORM must one of the following list and is used to select the memcpy/ 2 | # memset variants used in the replacement library (libfastarm.so) 3 | # - RPI selects optimizations for the armv6-based Raspberry Pi with a 4 | # preload offset of 96 bytes. 5 | # - ARMV7_32 selects a cache line size of 32, suitable for most Cortex 6 | # platforms. The used preload offset is 192 bytes. 7 | # - ARMV7_64 selects a cache line size of 64 bytes, suitable for potential 8 | # Cortex platforms in which all cache line fills (including from DRAM) are 9 | # 64 bytes. The used preload offset is 192 bytes. 10 | # - NEON_32 selects NEON optimizations with a cache line of 32 bytes. 11 | # The used preload offset is 192 bytes. 12 | # - NEON_64 selects NEON optimizations with a cahce line size of 64 bytes. 13 | # The used preload offset is 192 bytes. 14 | # - NEON_AUTO selects NEON optimizations for Cortex cores with a suitably 15 | # advanced automatic prefetcher that most preload instructions are unnecessary. 16 | # Only early preloads are generated. 17 | # Uncomment the THUMBFLAGS definition to compile in ARM mode as opposed to Thumb2 18 | 19 | PLATFORM = NEON_32 20 | THUMBFLAGS = -march=armv7-a -Wa,-march=armv7-a -mthumb -Wa,-mthumb \ 21 | -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB 22 | BENCHMARK_CONFIG_FLAGS = -DINCLUDE_MEMCPY_HYBRID # -DINCLUDE_LIBARMMEM_MEMCPY 23 | #LIBARMMEM = -larmmem 24 | CORTEX_STRINGS_MEMCPY_HYBRID = memcpy-hybrid.o 25 | CFLAGS = -std=gnu99 -Ofast -Wall $(BENCHMARK_CONFIG_FLAGS) 26 | PCFLAGS = -std=gnu99 -O -Wall $(BENCHMARK_CONFIG_FLAGS) -pg -ggdb 27 | 28 | all : benchmark libfastarm.so 29 | 30 | benchmark : benchmark.o arm_asm.o new_arm.o $(CORTEX_STRINGS_MEMCPY_HYBRID) 31 | $(CC) $(CFLAGS) benchmark.o arm_asm.o new_arm.o \ 32 | $(CORTEX_STRINGS_MEMCPY_HYBRID) -o benchmark -lm -lrt $(LIBARMMEM) 33 | 34 | benchmarkp : benchmark.c arm_asm.S 35 | $(CC) $(PCFLAGS) benchmark.c arm_asm.S new_arm.S -o benchmarkp -lc -lm -lrt $(LIBARMMEM) 36 | 37 | install_memcpy_replacement : libfastarm.so 38 | install -m 0755 libfastarm.so /usr/lib/arm-linux-gnueabihf/libfastarm.so 39 | @echo 'To enable the use of the enhanced memcpy by applications, edit or' 40 | @echo 'create the file /etc/ld.so.preload so that it contains the line:' 41 | @echo '/usr/lib/arm-linux-gnueabihf/libfastarm.so' 42 | @echo 'On the RPi platform, references to libcofi_rpi.so should be commented' 43 | @echo 'out or deleted.' 44 | 45 | libfastarm.so : memcpy_replacement.o 46 | $(CC) -o libfastarm.so -shared memcpy_replacement.o 47 | 48 | memcpy_replacement.o : new_arm.S 49 | $(CC) -c -s -x assembler-with-cpp $(THUMBFLAGS) \ 50 | -DMEMCPY_REPLACEMENT_$(PLATFORM) -DMEMSET_REPLACEMENT_$(PLATFORM) \ 51 | -o memcpy_replacement.o new_arm.S 52 | 53 | clean : 54 | rm -f benchmark 55 | rm -f benchmark.o 56 | rm -f benchmarkp 57 | rm -f arm_asm.s 58 | rm -f arm_asm.o 59 | rm -f new_arm.o 60 | rm -f memcpy_replacement.o 61 | rm -f libfastarm.so 62 | 63 | benchmark.o : benchmark.c arm_asm.h 64 | 65 | arm_asm.o : arm_asm.S arm_asm.h 66 | 67 | new_arm.o : new_arm.S new_arm.h 68 | 69 | memcpy-hybrid.o : memcpy-hybrid.S 70 | 71 | .c.o : 72 | $(CC) -c $(CFLAGS) $< -o $@ 73 | 74 | .S.o : 75 | $(CC) -c -s $(CFLAGS) $(THUMBFLAGS) $< -o $@ 76 | 77 | .c.s : 78 | $(CC) -S $(CFLAGS) $< -o $@ 79 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | fastarm 2 | 3 | This toolkit contains a set of fast memcpy/memset variants for ARM 4 | platforms. They either use the standard register file, or optionally 5 | NEON instructions, 6 | 7 | Several basic families of variants are provided; the current ones are 8 | the "new memcpy" variants which are the default for memcpy replacement, 9 | which generally do not overfetch beyond the source region and can be 10 | configured to use unaligned memory access for small sizes, or to use 11 | strictly aligned memory access. This family can also be configured to 12 | include a fast path for smaller sizes (this is the default), disabling 13 | this results in smaller code size at the expense of worse performance 14 | for small sizes. NEON optimized versions, which are generally faster 15 | with reduced code size, are also provided. 16 | 17 | To compile the benchmark program, run 'make'. This will compile in a 18 | plethora of variants with different preload strategies, block sizes, 19 | alignment etc. 20 | 21 | A benchmark program to compare various memcpy variants is provided. Try 22 | something like "./benchmark --memcpy ad --all". (Use --memcpy al on the 23 | Raspberry Pi platform). 24 | 25 | To compile a memcpy replacement library, set PLATFORM to one of the 26 | values described at the beginning of the Makefile. This selects the 27 | cache line size to use and whether to use NEON versions. 28 | 29 | Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS 30 | definition. It must be disabled on the Raspberry Pi. 31 | 32 | Then run: 33 | 34 | sudo make install_memcpy_replacement 35 | 36 | The replacement memcpy/memset shared library will be installed into 37 | /usr/lib/arm-linux-gnueabihf/ as libfastarm.so. 38 | 39 | To enable the use of the replacement memcpy in applications, create or edit 40 | the file /etc/ld.so.preload so that it contains the line: 41 | 42 | /usr/lib/arm-linux-gnueabihf/libfastarm.so 43 | 44 | On the RPi platform, references to libcofi_rpi.so should be commented out 45 | or deleted. The new memcpy should now be activated for newly launched 46 | programs. To be sure, reboot or run: 47 | 48 | sudo ldconfig 49 | 50 | To revert to the default optimized memcpy on the RPi platform, 51 | edit /etc/ld.so.preload so that it contains the line: 52 | 53 | /usr/lib/arm-linux-gnueabihf/libcofi_rpi.so 54 | 55 | instead of the one using libfastarm.so. 56 | 57 | Note on cache line size: 58 | 59 | Although assuming a preload line size 64 bytes is a little faster on several 60 | Cortex platforms for small to moderate sizes, when accessing DRAM 61 | with larger sizes assuming 32 byte preloads seems to be faster. On earlier 62 | Cortex A9 models, 32 byte preloads are required for good performance in all 63 | cases. 64 | 65 | Notes on performance with and without NEON: 66 | 67 | For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8 68 | platform for unaligned copies in cache memory and for aligned and unaligned 69 | copies in DRAM. Performance for aligned copies in cache memory is relatively 70 | similar to the optimized non-NEON function. 71 | 72 | Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of 73 | standard libc (Debian unstable), armv7 and NEON optimized memcpy 74 | variants with line size of 32 bytes: 75 | 76 | libc armv7 NEON 77 | test 0 522 549 567 78 | test 1 329 377 378 79 | test 2 434 430 513 80 | test 28 351 361 458 81 | test 29 246 248 358 82 | test 43 467 512 581 83 | 84 | Test 0 in the benchmark program tests word-aligned requests with 85 | sizes that are a power of 2 up to 4096 bytes distributed according 86 | to a power law. 87 | Test 1 in the benchmark program tests word-aligned requests with 88 | sizes up to 1024 that are a multiple of 4, distributed according 89 | to a power law. 90 | Test 2 in the benchmark program tests unaligned requests with sizes 91 | up to 1023 bytes. 92 | Test 28 in the benchmark program tests word aligned requests in DRAM 93 | with sizes up to 1024 bytes. 94 | Test 29 in the benchmark program tests word aligned requests in DRAM 95 | with sizes up to 256 bytes. 96 | Test 43 in the benchmark program tests page aligned requests in DRAM 97 | of size 4096 (copying a memory page). 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | fastarm 2 | ======= 3 | 4 | Experimental memcpy speed toolkit for ARM CPUs. Provides optimized replacement 5 | memcpy and memset functions for armv6/armv7 platforms without NEON and NEON- 6 | optimized versions for armv7 platforms with NEON. 7 | 8 | -------------------------------------------------------------------------------- /arm_asm.S: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2006-2008, 2013 Siarhei Siamashka 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice (including the next 12 | * paragraph) shall be included in all copies or substantial portions of the 13 | * Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | * 23 | * Copyright 2013 Harm Hanemaaijer 24 | * 25 | * 1. Add ".type , function" to function definition macro, which 26 | * was required for correct linkage on my platform. 27 | * 2. Add non-overfetching memcpy version with a plethora of optimizations and variants using 28 | * macros. 29 | * To do: -- More complete implementation of write_align == 64 for unaligned case. 30 | * 31 | * On the RPi platform, a good choice is armv5te_no_overfetch_align_16_block_write_16_preload_early_128, 32 | * closely followed by armv5te_no_overfetch_align_16_block_write_16_preload_early_96. For 33 | * CPU-cache based work loads armv5te_no_overfetch_align_16_block_write_16_preload_96 might be 34 | * a little faster. 35 | * 36 | * On the Allwinner A10 platform, with the reworked version a variant with cache line size of 64, 37 | * memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192, seems to be the 38 | * best performer. 39 | * 40 | * On the Allwinner platform, the optimized memcpy is faster; on the RPi libcofi does relatively well 41 | * and the optimal memcpy depends on the application. 42 | */ 43 | 44 | /* Prevent the stack from becoming executable */ 45 | #if defined(__linux__) && defined(__ELF__) 46 | .section .note.GNU-stack,"",%progbits 47 | #endif 48 | 49 | #ifdef __arm__ 50 | 51 | .text 52 | .syntax unified 53 | .fpu neon 54 | .arch armv7a 55 | .object_arch armv4 56 | .arm 57 | .altmacro 58 | .p2align 2 59 | 60 | /******************************************************************************/ 61 | 62 | .macro asm_function function_name 63 | .global \function_name 64 | .func \function_name 65 | .type \function_name, function 66 | .p2align 5 67 | \function_name: 68 | .endm 69 | 70 | /******************************************************************************/ 71 | 72 | #if !defined(MEMCPY_REPLACEMENT_SUNXI) && !defined(MEMCPY_REPLACEMENT_RPI) 73 | 74 | /* 75 | * Helper macro for memcpy function, it can copy data from source (r1) to 76 | * destination (r0) buffers fixing alignment in the process. Destination 77 | * buffer should be aligned already (4 bytes alignment is required. 78 | * Size of the block to copy is in r2 register 79 | */ 80 | .macro UNALIGNED_MEMCPY shift 81 | sub r1, #(\shift) 82 | ldr ip, [r1], #4 83 | 84 | tst r0, #4 85 | movne r3, ip, lsr #(\shift * 8) 86 | ldrne ip, [r1], #4 87 | subne r2, r2, #4 88 | orrne r3, r3, ip, asl #(32 - \shift * 8) 89 | strne r3, [r0], #4 90 | 91 | tst r0, #8 92 | movne r3, ip, lsr #(\shift * 8) 93 | ldmiane r1!, {r4, ip} 94 | subne r2, r2, #8 95 | orrne r3, r3, r4, asl #(32 - \shift * 8) 96 | movne r4, r4, lsr #(\shift * 8) 97 | orrne r4, r4, ip, asl #(32 - \shift * 8) 98 | stmiane r0!, {r3-r4} 99 | cmp r2, #32 100 | blt 3f 101 | pld [r1, #48] 102 | stmfd sp!, {r7, r8, r9, r10, r11} 103 | add r3, r1, #128 104 | bic r3, r3, #31 105 | sub r9, r3, r1 106 | 1: 107 | pld [r1, r9] 108 | subs r2, r2, #32 109 | movge r3, ip, lsr #(\shift * 8) 110 | ldmiage r1!, {r4-r6, r7, r8, r10, r11, ip} 111 | orrge r3, r3, r4, asl #(32 - \shift * 8) 112 | movge r4, r4, lsr #(\shift * 8) 113 | orrge r4, r4, r5, asl #(32 - \shift * 8) 114 | movge r5, r5, lsr #(\shift * 8) 115 | orrge r5, r5, r6, asl #(32 - \shift * 8) 116 | movge r6, r6, lsr #(\shift * 8) 117 | orrge r6, r6, r7, asl #(32 - \shift * 8) 118 | stmiage r0!, {r3-r6} 119 | movge r7, r7, lsr #(\shift * 8) 120 | orrge r7, r7, r8, asl #(32 - \shift * 8) 121 | movge r8, r8, lsr #(\shift * 8) 122 | orrge r8, r8, r10, asl #(32 - \shift * 8) 123 | movge r10, r10, lsr #(\shift * 8) 124 | orrge r10, r10, r11, asl #(32 - \shift * 8) 125 | movge r11, r11, lsr #(\shift * 8) 126 | orrge r11, r11, ip, asl #(32 - \shift * 8) 127 | stmiage r0!, {r7, r8, r10, r11} 128 | bgt 1b 129 | 2: 130 | ldmfd sp!, {r7, r8, r9, r10, r11} 131 | 3: /* copy remaining data */ 132 | tst r2, #16 133 | movne r3, ip, lsr #(\shift * 8) 134 | ldmiane r1!, {r4-r6, ip} 135 | orrne r3, r3, r4, asl #(32 - \shift * 8) 136 | movne r4, r4, lsr #(\shift * 8) 137 | orrne r4, r4, r5, asl #(32 - \shift * 8) 138 | movge r5, r5, lsr #(\shift * 8) 139 | orrge r5, r5, r6, asl #(32 - \shift * 8) 140 | movge r6, r6, lsr #(\shift * 8) 141 | orrge r6, r6, ip, asl #(32 - \shift * 8) 142 | stmiane r0!, {r3-r6} 143 | 144 | tst r2, #8 145 | movne r3, ip, lsr #(\shift * 8) 146 | ldmiane r1!, {r4, ip} 147 | orrne r3, r3, r4, asl #(32 - \shift * 8) 148 | movne r4, r4, lsr #(\shift * 8) 149 | orrne r4, r4, ip, asl #(32 - \shift * 8) 150 | stmiane r0!, {r3-r4} 151 | 152 | tst r2, #4 153 | movne r3, ip, lsr #(\shift * 8) 154 | ldrne ip, [r1], #4 155 | sub r1, r1, #(4 - \shift) 156 | orrne r3, r3, ip, asl #(32 - \shift * 8) 157 | strne r3, [r0], #4 158 | 159 | tst r2, #2 160 | ldrbne r3, [r1], #1 161 | ldrbne r4, [r1], #1 162 | ldr r5, [sp], #4 163 | strbne r3, [r0], #1 164 | strbne r4, [r0], #1 165 | 166 | tst r2, #1 167 | ldrbne r3, [r1], #1 168 | ldr r6, [sp], #4 169 | strbne r3, [r0], #1 170 | 171 | pop {r0, r4} 172 | 173 | bx lr 174 | .endm 175 | 176 | /* 177 | * Memcpy function with Raspberry Pi specific aligned prefetch, based on 178 | * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S 179 | */ 180 | asm_function memcpy_armv5te 181 | cmp r2, #20 182 | blt 9f 183 | /* copy data until destination address is 4 bytes aligned */ 184 | tst r0, #1 185 | ldrbne r3, [r1], #1 186 | stmfd sp!, {r0, r4} 187 | subne r2, r2, #1 188 | strbne r3, [r0], #1 189 | tst r0, #2 190 | ldrbne r3, [r1], #1 191 | ldrbne r4, [r1], #1 192 | stmfd sp!, {r5, r6} 193 | subne r2, r2, #2 194 | orrne r3, r3, r4, asl #8 195 | strhne r3, [r0], #2 196 | /* destination address is 4 bytes aligned */ 197 | /* now we should handle 4 cases of source address alignment */ 198 | tst r1, #1 199 | bne 6f 200 | tst r1, #2 201 | bne 7f 202 | 203 | /* both source and destination are 4 bytes aligned */ 204 | stmfd sp!, {r7, r8, r9, r10, r11} 205 | tst r0, #4 206 | ldrne r4, [r1], #4 207 | subne r2, r2, #4 208 | strne r4, [r0], #4 209 | tst r0, #8 210 | ldmiane r1!, {r3-r4} 211 | add r9, r1, #96 212 | subne r2, r2, #8 213 | bic r9, r9, #31 214 | stmiane r0!, {r3-r4} 215 | sub r9, r9, r1 216 | 1: 217 | subs r2, r2, #32 218 | ldmiage r1!, {r3-r6, r7, r8, r10, r11} 219 | pld [r1, r9] 220 | stmiage r0!, {r3-r6} 221 | stmiage r0!, {r7, r8, r10, r11} 222 | bgt 1b 223 | 2: 224 | ldmfd sp!, {r7, r8, r9, r10, r11} 225 | tst r2, #16 226 | ldmiane r1!, {r3-r6} 227 | stmiane r0!, {r3-r6} 228 | tst r2, #8 229 | ldmiane r1!, {r3-r4} 230 | stmiane r0!, {r3-r4} 231 | tst r2, #4 232 | ldrne r3, [r1], #4 233 | mov ip, r0 234 | strne r3, [ip], #4 235 | tst r2, #2 236 | ldrhne r3, [r1], #2 237 | ldmfd sp!, {r5, r6} 238 | strhne r3, [ip], #2 239 | tst r2, #1 240 | ldrbne r3, [r1], #1 241 | ldmfd sp!, {r0, r4} 242 | strbne r3, [ip], #1 243 | 244 | bx lr 245 | 246 | 6: 247 | tst r1, #2 248 | bne 8f 249 | UNALIGNED_MEMCPY 1 250 | 7: 251 | UNALIGNED_MEMCPY 2 252 | 8: 253 | UNALIGNED_MEMCPY 3 254 | 9: 255 | stmfd sp!, {r0, r4} 256 | 1: subs r2, r2, #3 257 | ldrbge ip, [r0] 258 | ldrbge r3, [r1], #1 259 | ldrbge r4, [r1], #1 260 | ldrbge ip, [r1], #1 261 | strbge r3, [r0], #1 262 | strbge r4, [r0], #1 263 | strbge ip, [r0], #1 264 | bge 1b 265 | adds r2, r2, #2 266 | ldrbge r3, [r1], #1 267 | mov ip, r0 268 | ldr r0, [sp], #4 269 | strbge r3, [ip], #1 270 | ldrbgt r3, [r1], #1 271 | ldr r4, [sp], #4 272 | strbgt r3, [ip], #1 273 | bx lr 274 | .endfunc 275 | 276 | #endif 277 | 278 | /* 279 | * PRELOAD_CATCH_UP enables catching up the early preload offset with the preload offset in 280 | * the main loop. 281 | */ 282 | 283 | #define PRELOAD_CATCH_UP 284 | 285 | /* 286 | * CHECK_EARLY_PRELOADS enables checks to avoid overfetching beyond the source region when 287 | * doing early preloads. This is currently only implemented for the unaligned case. 288 | * Due to the overhead it adds this option may not improve performance. 289 | */ 290 | // #define CHECK_EARLY_PRELOADS 291 | 292 | /* 293 | * Allow unaligned memory access. 294 | */ 295 | 296 | #define UNALIGNED_ACCESS 297 | 298 | /* 299 | * Helper macro for non-overfetching version. 300 | * 301 | * If preload_early == 1, 302 | * r6 is the address of the 32-byte aligned region containing the last source byte. 303 | * r3 is the address of the 32-byte aligned region where the first preload occurred, preloads 304 | * have occurred up to [r3 + line_size]. 305 | * 306 | * Registers up to r7 have been saved on the stack. 307 | */ 308 | 309 | .macro UNALIGNED_MEMCPY_VARIANT granularity, shift, line_size, write_align, block_write_size, preload_offset, preload_early, overfetch 310 | sub r1, #(\shift) 311 | .if \preload_early == 1 312 | add r7, r3, #(\line_size * 2) 313 | .endif 314 | ldr ip, [r1], #4 315 | .if \preload_early == 1 316 | #ifdef CHECK_EARLY_PRELOADS 317 | .if \overfetch == 0 318 | cmp r6, r7 319 | /* Only preload if the source region extends into it. */ 320 | blt 5f 321 | .endif 322 | #endif 323 | pld [r7] 324 | 5: 325 | .endif 326 | 327 | tst r0, #4 328 | movne r3, ip, lsr #(\shift * 8) 329 | ldrne ip, [r1], #4 330 | subne r2, r2, #4 331 | orrne r3, r3, ip, asl #(32 - \shift * 8) 332 | strne r3, [r0], #4 333 | 334 | tst r0, #8 335 | movne r3, ip, lsr #(\shift * 8) 336 | ldmiane r1!, {r4, ip} 337 | subne r2, r2, #8 338 | orrne r3, r3, r4, asl #(32 - \shift * 8) 339 | movne r4, r4, lsr #(\shift * 8) 340 | orrne r4, r4, ip, asl #(32 - \shift * 8) 341 | stmiane r0!, {r3-r4} 342 | 343 | .if \write_align >= 32 344 | tst r0, #16 345 | movne r3, ip, lsr #(\shift * 8) 346 | beq 5f 347 | ldmia r1!, {r4-r6, ip} 348 | sub r2, r2, #16 349 | orr r3, r3, r4, asl #(32 - \shift * 8) 350 | mov r4, r4, lsr #(\shift * 8) 351 | .if \write_align == 32 352 | cmp r2, #32 353 | .endif 354 | orr r4, r4, r5, asl #(32 - \shift * 8) 355 | mov r5, r5, lsr #(\shift * 8) 356 | orr r5, r5, r6, asl #(32 - \shift * 8) 357 | mov r6, r6, lsr #(\shift * 8) 358 | orr r6, r6, ip, asl #(32 - \shift * 8) 359 | stmia r0!, {r3-r6} 360 | .if \write_align == 32 361 | blt 3f 362 | b 1f 363 | .endif 364 | 5: 365 | .endif 366 | 367 | .if \write_align == 64 368 | tst r0, #32 369 | movne r3, ip, lsr #(\shift * 8) 370 | beq 5f 371 | ldmia r1!, {r4-r6, ip} 372 | sub r2, r2, #32 373 | orr r3, r3, r4, asl #(32 - \shift * 8) 374 | mov r4, r4, lsr #(\shift * 8) 375 | cmp r2, #32 376 | orr r4, r4, r5, asl #(32 - \shift * 8) 377 | mov r5, r5, lsr #(\shift * 8) 378 | orr r5, r5, r6, asl #(32 - \shift * 8) 379 | mov r6, r6, lsr #(\shift * 8) 380 | orr r6, r6, ip, asl #(32 - \shift * 8) 381 | stmia r0!, {r3-r6} 382 | mov r3, ip, lsr #(\shift * 8) 383 | ldmia r1!, {r4-r6, ip} 384 | orr r3, r3, r4, asl #(32 - \shift * 8) 385 | mov r4, r4, lsr #(\shift * 8) 386 | orr r4, r4, r5, asl #(32 - \shift * 8) 387 | mov r5, r5, lsr #(\shift * 8) 388 | orr r5, r5, r6, asl #(32 - \shift * 8) 389 | mov r6, r6, lsr #(\shift * 8) 390 | orr r6, r6, ip, asl #(32 - \shift * 8) 391 | stmia r0!, {r3-r6} 392 | blt 3f 393 | b 1f 394 | 5: 395 | .endif 396 | 397 | cmp r2, #32 398 | blt 3f 399 | 1: 400 | .if \preload_offset != 0 401 | .if \overfetch == 1 402 | cmp r2, #64 403 | .else 404 | cmp r2, #\preload_offset 405 | .endif 406 | .endif 407 | stmfd sp!, {r8, r9, r10, r11} 408 | .if \preload_offset != 0 409 | add r10, r1, #\preload_offset 410 | #ifdef PRELOAD_CATCH_UP 411 | .if \preload_early == 1 && \preload_offset >= 64 && \block_write_size >= 16 412 | add r7, r7, #(\line_size * 2) 413 | .endif 414 | #endif 415 | bic r10, r10, #(\line_size - 1) 416 | sub r9, r10, r1 417 | .if \overfetch == 0 418 | /* If there are <= preload_offset bytes to go, skip the main loop. */ 419 | ble 4f 420 | .else 421 | blt 1f 422 | .endif 423 | .if \preload_early == 1 && \preload_offset >= 64 && \block_write_size >= 16 424 | /* 425 | * At this point, if overfetch is 0, there are at least preload_offset 426 | * bytes left, so when CHECK_EARLY_PRELOAD is set, we only need to 427 | * perform a check if it is possible that the preload overfetches, 428 | * given that the upcoming early preload is the 4th one (making a 429 | * total of line_size * 4 byte preloaded from the 32-byte aligned 430 | * start address). 431 | */ 432 | #ifdef PRELOAD_CATCH_UP 433 | #ifdef CHECK_EARLY_PRELOADS 434 | .if \preload_offset < (\line_size * 4) 435 | add r3, r1, r2 436 | mov r11, r7 437 | sub r3, r3, #1 438 | sub r7, #\line_size 439 | bic r3, r3, #(\line_size - 1) 440 | cmp r7, r3 441 | add r7, #\line_size 442 | bgt 5f 443 | pld [r7] 444 | 5: 445 | .else 446 | mov r11, r7 447 | pld [r7, #-\line_size] 448 | .endif 449 | #else 450 | mov r11, r7 451 | pld [r7, #-\line_size] 452 | #endif 453 | #else 454 | #ifdef CHECK_EARLY_PRELOADS 455 | .if \preload_offset < (\line_size * 4) 456 | add r3, r1, r2 457 | add r7, #\line_size 458 | sub r3, r3, #1 459 | bic r3, r3, #(\line_size - 1) 460 | cmp r7, r3 461 | bgt 5f 462 | pld [r7] 463 | 5: 464 | .else 465 | pld [r7, #\line_size] 466 | .endif 467 | #else 468 | pld [r7, #\line_size] 469 | #endif 470 | #endif 471 | #ifdef PRELOAD_CATCH_UP 472 | /* 473 | * The last preload already done is at [r11 - line_size]. 474 | * The next preload in the main loop will happen at [r10]. 475 | * If r11 < r10, we want to do an extra preload at [r11]. 476 | * Note if write alignment is 64, it may become unaligned. 477 | */ 478 | 18: 479 | cmp r11, r10 480 | movlt r3, ip, lsr #(\shift * 8) 481 | ldmialt r1!, {r4-r6, r7} 482 | add r11, #64 483 | orrlt r3, r3, r4, asl #(32 - \shift * 8) 484 | movlt r4, r4, lsr #(\shift * 8) 485 | bge 1f 486 | cmp r2, #(\preload_offset + 32) 487 | pld [r11, #-64] 488 | sub r2, r2, #32 489 | orr r4, r4, r5, asl #(32 - \shift * 8) 490 | mov r5, r5, lsr #(\shift * 8) 491 | orr r5, r5, r6, asl #(32 - \shift * 8) 492 | mov r6, r6, lsr #(\shift * 8) 493 | orr r6, r6, r7, asl #(32 - \shift * 8) 494 | mov r7, r7, lsr #(\shift * 8) 495 | stmia r0!, {r3-r6} 496 | mov r3, r7 497 | ldmia r1!, {r4, r5, r6, ip} 498 | orr r3, r3, r4, asl #(32 - \shift * 8) 499 | add r10, r1, r9 500 | mov r4, r4, lsr #(\shift * 8) 501 | orr r4, r4, r5, asl #(32 - \shift * 8) 502 | mov r5, r5, lsr #(\shift * 8) 503 | .if \line_size == 32 504 | pld [r11, #-32] 505 | .endif 506 | orr r5, r5, r6, asl #(32 - \shift * 8) 507 | mov r6, r6, lsr #(\shift * 8) 508 | orr r6, r6, ip, asl #(32 - \shift * 8) 509 | stmia r0!, {r3, r4, r5, r6} 510 | bgt 18b 511 | .if \overfetch == 0 512 | b 4f 513 | .endif 514 | #endif 515 | .endif 516 | 1: 517 | .if \line_size == 64 || \write_align == 64 518 | /* Process 64 bytes at a time. */ 519 | .if \overfetch == 1 520 | cmp r2, #(64 + 64) 521 | .else 522 | cmp r2, #(\preload_offset + 64) 523 | .endif 524 | pld [r1, r9] 525 | mov r3, ip, lsr #(\shift * 8) 526 | ldmia r1!, {r4-r6, r7, r8, r10, r11, ip} 527 | orr r3, r3, r4, asl #(32 - \shift * 8) 528 | mov r4, r4, lsr #(\shift * 8) 529 | sub r2, r2, #32 530 | orr r4, r4, r5, asl #(32 - \shift * 8) 531 | mov r5, r5, lsr #(\shift * 8) 532 | orr r5, r5, r6, asl #(32 - \shift * 8) 533 | mov r6, r6, lsr #(\shift * 8) 534 | orr r6, r6, r7, asl #(32 - \shift * 8) 535 | mov r7, r7, lsr #(\shift * 8) 536 | .if \block_write_size == 16 537 | stmia r0!, {r3-r6} 538 | .endif 539 | orr r7, r7, r8, asl #(32 - \shift * 8) 540 | mov r8, r8, lsr #(\shift * 8) 541 | orr r8, r8, r10, asl #(32 - \shift * 8) 542 | mov r10, r10, lsr #(\shift * 8) 543 | .if \block_write_size == 8 544 | stmia r0!, {r7-r8} 545 | .endif 546 | orr r10, r10, r11, asl #(32 - \shift * 8) 547 | mov r11, r11, lsr #(\shift * 8) 548 | orr r11, r11, ip, asl #(32 - \shift * 8) 549 | .if \block_write_size == 32 550 | stmia r0!, {r3-r6, r7, r8, r10, r11} 551 | .endif 552 | .if \block_write_size == 16 553 | stmia r0!, {r7, r8, r10, r11} 554 | .endif 555 | .if \line_size == 32 556 | pld [r1, r9] 557 | .endif 558 | mov r3, ip, lsr #(\shift * 8) 559 | ldmia r1!, {r4-r6, r7, r8, r10, r11, ip} 560 | orr r3, r3, r4, asl #(32 - \shift * 8) 561 | mov r4, r4, lsr #(\shift * 8) 562 | sub r2, r2, #32 563 | orr r4, r4, r5, asl #(32 - \shift * 8) 564 | mov r5, r5, lsr #(\shift * 8) 565 | orr r5, r5, r6, asl #(32 - \shift * 8) 566 | mov r6, r6, lsr #(\shift * 8) 567 | orr r6, r6, r7, asl #(32 - \shift * 8) 568 | mov r7, r7, lsr #(\shift * 8) 569 | .if \block_write_size == 16 570 | stmia r0!, {r3-r6} 571 | .endif 572 | orr r7, r7, r8, asl #(32 - \shift * 8) 573 | mov r8, r8, lsr #(\shift * 8) 574 | orr r8, r8, r10, asl #(32 - \shift * 8) 575 | mov r10, r10, lsr #(\shift * 8) 576 | .if \block_write_size == 8 577 | stmia r0!, {r7-r8} 578 | .endif 579 | orr r10, r10, r11, asl #(32 - \shift * 8) 580 | mov r11, r11, lsr #(\shift * 8) 581 | orr r11, r11, ip, asl #(32 - \shift * 8) 582 | .if \block_write_size == 32 583 | stmia r0!, {r3-r6, r7, r8, r10, r11} 584 | .endif 585 | .if \block_write_size == 16 586 | stmia r0!, {r7, r8, r10, r11} 587 | .endif 588 | .else 589 | /* Process 32 bytes at a time. */ 590 | .if \overfetch == 1 591 | cmp r2, #(32 + 32) 592 | .else 593 | cmp r2, #(\preload_offset + 32) 594 | .endif 595 | pld [r1, r9] 596 | mov r3, ip, lsr #(\shift * 8) 597 | ldmia r1!, {r4-r6, r7, r8, r10, r11, ip} 598 | orr r3, r3, r4, asl #(32 - \shift * 8) 599 | mov r4, r4, lsr #(\shift * 8) 600 | sub r2, r2, #32 601 | orr r4, r4, r5, asl #(32 - \shift * 8) 602 | mov r5, r5, lsr #(\shift * 8) 603 | .if \block_write_size == 8 604 | stmia r0!, {r3-r4} 605 | .endif 606 | orr r5, r5, r6, asl #(32 - \shift * 8) 607 | mov r6, r6, lsr #(\shift * 8) 608 | orr r6, r6, r7, asl #(32 - \shift * 8) 609 | mov r7, r7, lsr #(\shift * 8) 610 | .if \block_write_size == 16 611 | stmia r0!, {r3-r6} 612 | .endif 613 | .if \block_write_size == 8 614 | stmia r0!, {r5-r6} 615 | .endif 616 | orr r7, r7, r8, asl #(32 - \shift * 8) 617 | mov r8, r8, lsr #(\shift * 8) 618 | orr r8, r8, r10, asl #(32 - \shift * 8) 619 | mov r10, r10, lsr #(\shift * 8) 620 | .if \block_write_size == 8 621 | stmia r0!, {r7-r8} 622 | .endif 623 | orr r10, r10, r11, asl #(32 - \shift * 8) 624 | mov r11, r11, lsr #(\shift * 8) 625 | orr r11, r11, ip, asl #(32 - \shift * 8) 626 | .if \block_write_size == 32 627 | stmia r0!, {r3-r6, r7, r8, r10, r11} 628 | .endif 629 | .if \block_write_size == 16 630 | stmia r0!, {r7, r8, r10, r11} 631 | .endif 632 | .if \block_write_size == 8 633 | stmia r0!, {r10-r11} 634 | .endif 635 | .endif 636 | bge 1b 637 | .endif /* preload_offset != 0 */ 638 | .if \overfetch == 0 639 | 4: 640 | cmp r2, #(32 + 32) 641 | mov r3, ip, lsr #(\shift * 8) 642 | ldmia r1!, {r4-r6, r7, r8, r10, r11, ip} 643 | orr r3, r3, r4, asl #(32 - \shift * 8) 644 | sub r2, r2, #32 645 | mov r4, r4, lsr #(\shift * 8) 646 | orr r4, r4, r5, asl #(32 - \shift * 8) 647 | mov r5, r5, lsr #(\shift * 8) 648 | .if \block_write_size == 8 649 | stmia r0!, {r3-r4} 650 | .endif 651 | orr r5, r5, r6, asl #(32 - \shift * 8) 652 | mov r6, r6, lsr #(\shift * 8) 653 | orr r6, r6, r7, asl #(32 - \shift * 8) 654 | mov r7, r7, lsr #(\shift * 8) 655 | .if \block_write_size == 16 656 | stmia r0!, {r3-r6} 657 | .endif 658 | .if \block_write_size == 8 659 | stmia r0!, {r5-r6} 660 | .endif 661 | orr r7, r7, r8, asl #(32 - \shift * 8) 662 | mov r8, r8, lsr #(\shift * 8) 663 | orr r8, r8, r10, asl #(32 - \shift * 8) 664 | mov r10, r10, lsr #(\shift * 8) 665 | .if \block_write_size == 8 666 | stmia r0!, {r7-r8} 667 | .endif 668 | orr r10, r10, r11, asl #(32 - \shift * 8) 669 | mov r11, r11, lsr #(\shift * 8) 670 | orr r11, r11, ip, asl #(32 - \shift * 8) 671 | .if \block_write_size == 32 672 | stmia r0!, {r3-r6, r7, r8, r10, r11} 673 | .endif 674 | .if \block_write_size == 16 675 | stmia r0!, {r7, r8, r10, r11} 676 | .endif 677 | .if \block_write_size == 8 678 | stmia r0!, {r10-r11} 679 | .endif 680 | bge 4b 681 | .endif /* overfetch == 0 */ 682 | 21: 683 | ldmfd sp!, {r8, r9, r10, r11} 684 | 3: /* copy remaining data */ 685 | tst r2, #16 686 | ldmfd sp!, {r7} 687 | mov r3, ip, lsr #(\shift * 8) 688 | beq 1f 689 | ldmia r1!, {r4-r6, ip} 690 | orr r3, r3, r4, asl #(32 - \shift * 8) 691 | mov r4, r4, lsr #(\shift * 8) 692 | orr r4, r4, r5, asl #(32 - \shift * 8) 693 | mov r5, r5, lsr #(\shift * 8) 694 | orr r5, r5, r6, asl #(32 - \shift * 8) 695 | mov r6, r6, lsr #(\shift * 8) 696 | orr r6, r6, ip, asl #(32 - \shift * 8) 697 | stmia r0!, {r3-r6} 698 | 1: 699 | tst r2, #8 700 | movne r3, ip, lsr #(\shift * 8) 701 | ldmiane r1!, {r4, ip} 702 | orrne r3, r3, r4, asl #(32 - \shift * 8) 703 | movne r4, r4, lsr #(\shift * 8) 704 | orrne r4, r4, ip, asl #(32 - \shift * 8) 705 | stmiane r0!, {r3-r4} 706 | 707 | tst r2, #4 708 | movne r3, ip, lsr #(\shift * 8) 709 | ldrne ip, [r1], #4 710 | sub r1, r1, #(4 - \shift) 711 | orrne r3, r3, ip, asl #(32 - \shift * 8) 712 | strne r3, [r0], #4 713 | 714 | .if \granularity <= 2 715 | tst r2, #2 716 | ldrbne r3, [r1], #1 717 | ldrbne r4, [r1], #1 718 | .endif 719 | ldr r5, [sp], #4 720 | .if \granularity <= 2 721 | strbne r3, [r0], #1 722 | strbne r4, [r0], #1 723 | .endif 724 | 725 | .if \granularity == 1 726 | tst r2, #1 727 | ldrbne r3, [r1], #1 728 | .endif 729 | ldr r6, [sp], #4 730 | .if \granularity == 1 731 | strbne r3, [r0], #1 732 | .endif 733 | 734 | pop {r0, r4} 735 | 736 | bx lr 737 | .endm 738 | 739 | 740 | /* 741 | * Macro that defines the main body of a memcpy version with optional no over-fetching 742 | * beyond the source memory region. 743 | * 744 | * granularity must be 1, 2 or 4. This value is 1 for normal memcpy, 2 for operations on half-word 745 | * aligned regions such as 16bpp framebuffers/images, and 4 for operations on word aligned regions 746 | * such as 32bpp framebuffers\images. 747 | * line_size must be 32 or 64. 748 | * write_align must be 32 or 16, or 64. 749 | * block_write_size must be 32, 16 or 8. 750 | * preload_offset must be a multiple of 32, 96 was the default setting. When preload_offset is 0, 751 | * no preload instructions will be generated at all. 752 | * preload_early must be 0 or 1. 753 | * overfetch must be 0 or 1. 754 | * 755 | * If line_size is 64, write_align must be 32 or 64, block_write_size must be 32, and preload_offset 756 | * must be a multiple of 64. 757 | * 758 | * If line_size is 64 or write_align is 64, overfetch must be 0. 759 | */ 760 | 761 | .macro MEMCPY_VARIANT granularity, line_size, write_align, block_write_size, preload_offset, preload_early, overfetch 762 | cmp r2, #52 763 | bic r3, r1, #(\line_size - 1) 764 | .if \preload_early == 1 765 | pld [r3] 766 | .endif 767 | /* Jump if we have a large size. */ 768 | bge 1f 769 | 770 | .if \granularity <= 2 771 | /* 772 | * Small sizes. Test whether both source and destination are word aligned. 773 | */ 774 | tst r0, #3 775 | andseq r3, r1, #3 776 | /* If not, jump to the unaligned code for small sizes */ 777 | mov ip, r0 778 | bne 9f 779 | .else 780 | mov ip, r0 781 | .endif 782 | 783 | /* Copy words. Fast path for small sizes with word aligned src and dest. */ 784 | /* ip must be equal to the original r0. */ 785 | 29: 786 | 22: 787 | cmp r2, #8 788 | ldrge r3, [r1], #4 789 | strge r3, [r0], #4 790 | ldrge r3, [r1], #4 791 | sub r2, r2, #8 792 | strge r3, [r0], #4 793 | bgt 22b 794 | moveq r0,ip 795 | bxeq lr 796 | tst r2, #4 797 | ldrne r3, [r1], #4 798 | strne r3, [r0], #4 799 | tst r2, #3 800 | moveq r0, ip 801 | bxeq lr 802 | .if \granularity <= 2 803 | tst r2, #2 804 | ldrhne r3, [r1], #2 805 | strhne r3, [r0], #2 806 | .endif 807 | .if \granularity == 1 808 | tst r2, #1 809 | ldrbne r3, [r1] 810 | strbne r3, [r0] 811 | .endif 812 | mov r0, ip 813 | bx lr 814 | 815 | 1: 816 | /* 817 | * Larger sizes. Copy data until destination address is 4 bytes aligned. 818 | * Optimize the common case in which both source and destination are 819 | * are already word-aligned. 820 | */ 821 | .if \granularity == 1 822 | tst r0, #3 823 | stmfd sp!, {r0, r4} 824 | andseq r3, r1, #3 825 | stmfd sp!, {r5, r6} 826 | #ifdef CHECK_EARLY_PRELOADS 827 | .if \preload_early == 1 828 | /* Determine the 32-byte aligned address of the last byte. */ 829 | addeq r6, r1, r2 830 | .endif 831 | #endif 832 | beq 2f 833 | .else 834 | stmfd sp!, {r0, r4} 835 | stmfd sp!, {r5, r6} 836 | .endif 837 | 838 | .if \granularity == 1 839 | tst r0, #1 840 | ldrbne r4, [r1], #1 841 | subne r2, r2, #1 842 | strbne r4, [r0], #1 843 | .endif 844 | 845 | .if \granularity <= 2 846 | tst r0, #2 847 | ldrbne r4, [r1], #1 848 | .endif 849 | .if \granularity <= 2 850 | ldrbne r5, [r1], #1 851 | subne r2, r2, #2 852 | orrne r4, r4, r5, asl #8 853 | .endif 854 | #ifdef CHECK_EARLY_PRELOADS 855 | .if \preload_early == 1 856 | /* Determine the 32-byte aligned address of the last byte. */ 857 | add r6, r1, r2 858 | .endif 859 | #endif 860 | .if \granularity <= 2 861 | strhne r4, [r0], #2 862 | .endif 863 | /* destination address is 4 bytes aligned */ 864 | 865 | .if \granularity == 1 866 | tst r1, #1 867 | .endif 868 | #ifdef CHECK_EARLY_PRELOADS 869 | .if \preload_early == 1 870 | sub r6, r6, #1 871 | .endif 872 | #endif 873 | #ifdef CHECK_EARLY_PRELOADS 874 | .if \preload_early == 1 875 | bic r6, r6, #(\line_size - 1) 876 | .endif 877 | #endif 878 | 879 | /* now we should handle 4 cases of source address alignment */ 880 | .if \granularity == 1 881 | bne 6f 882 | .endif 883 | .if \granularity <= 2 884 | tst r1, #2 885 | .endif 886 | stmfd sp!, {r7} 887 | .if \granularity <= 2 888 | bne 7f 889 | .endif 890 | tst r0, #4 891 | b 3f 892 | 893 | 2: 894 | /* Further optimize for the 16-byte aligned case. */ 895 | tst r0, #12 896 | #ifdef CHECK_EARLY_PRELOADS 897 | .if \preload_early == 1 898 | sub r6, r6, #1 899 | .endif 900 | #endif 901 | .if \preload_early == 1 902 | pld [r3, #\line_size] 903 | .endif 904 | #ifdef CHECK_EARLY_PRELOADS 905 | .if \preload_early == 1 906 | bic r6, r6, #(\line_size - 1) 907 | .endif 908 | #endif 909 | stmfd sp!, {r7} 910 | beq 1f 911 | tst r0, #4 912 | 913 | 3: 914 | /* both source and destination are 4 bytes aligned */ 915 | #ifdef CHECK_EARLY_PRELOADS 916 | .if \preload_early == 1 917 | mov ip, r6 918 | .endif 919 | #endif 920 | ldrne r5, [r1], #4 921 | subne r2, r2, #4 922 | strne r5, [r0], #4 923 | tst r0, #8 924 | ldmiane r1!, {r4, r5} 925 | subne r2, r2, #8 926 | stmiane r0!, {r4, r5} 927 | 1: 928 | .if \write_align >= 32 929 | tst r0, #16 930 | ldmiane r1!, {r4-r7} 931 | subne r2, r2, #16 932 | stmiane r0!, {r4-r7} 933 | .endif 934 | .if \write_align == 64 935 | tst r0, #32 936 | ldmiane r1!, {r4-r7} 937 | subne r2, r2, #32 938 | stmiane r0!, {r4-r7} 939 | ldmiane r1!, {r4-r7} 940 | stmiane r0!, {r4-r7} 941 | .endif 942 | /* Source is now write_align bytes aligned. */ 943 | 944 | /* 945 | * The chunk size is defined is 64 if write_align == 64 or line_size = 64; 946 | * otherwise, it is equal to write_align. 947 | * If the number of bytes left is smaller than the chunk size, skip all loops. 948 | * If the number of bytes left is <= (preload_offset + chunk_size), skip the 949 | * loop with preload and jump to the loop without preload. 950 | * Also calculate the preload offset in r9 and the address of the next main loop preload 951 | * in r5 if early preload is enabled and PRELOAD_CATCH_UP is set. 952 | * If preload is enabled, r3 is updated to hold the address of the next early preload. 953 | */ 954 | .if \preload_offset == 0 955 | cmp r2, #32 956 | blt 14f 957 | stmfd sp!, {r8, r9, r10, r11} 958 | .elseif \write_align == 64 || \line_size == 64 959 | cmp r2, #64 960 | .if \line_size == 64 && \write_align == 32 961 | add r5, r1, #\preload_offset 962 | .endif 963 | .if \preload_early == 1 964 | pld [r3, #(\line_size * 2)] 965 | #ifdef PRELOAD_CATCH_UP 966 | add r3, #(\line_size * 3) 967 | #endif 968 | .endif 969 | .if \line_size == 64 && \write_align == 32 970 | bic r5, r5, #63 971 | .else 972 | #ifdef PRELOAD_CATCH_UP 973 | .if \preload_early == 1 974 | add r5, r1, #\preload_offset 975 | .endif 976 | #endif 977 | .endif 978 | blt 2f 979 | cmp r2, #(\preload_offset + 64) 980 | stmfd sp!, {r8, r9, r10, r11} 981 | .if \line_size == 64 && \write_align == 32 982 | sub r9, r5, r1 983 | .else 984 | mov r9, #\preload_offset 985 | .endif 986 | .if \overfetch == 1 987 | ble 1f 988 | .else 989 | ble 10f 990 | .endif 991 | .elseif \write_align == 32 992 | /* In the case of line_size == 32 and write_align == 32 r9 will be equal to preload_offset. */ 993 | cmp r2, #32 994 | .if \preload_early == 1 995 | pld [r3, #(\line_size * 2)] 996 | #ifdef PRELOAD_CATCH_UP 997 | add r3, #(\line_size * 3) 998 | #endif 999 | .endif 1000 | #ifdef PRELOAD_CATCH_UP 1001 | .if \preload_early == 1 1002 | add r5, r1, #\preload_offset 1003 | .endif 1004 | #endif 1005 | blt 14f 1006 | cmp r2, #(\preload_offset + 32) 1007 | stmfd sp!, {r8, r9, r10, r11} 1008 | mov r9, #\preload_offset 1009 | .if \overfetch == 1 1010 | ble 1f 1011 | .else 1012 | ble 10f 1013 | .endif 1014 | .else // write_align == 16 1015 | cmp r2, #32 1016 | add r5, r1, #\preload_offset 1017 | .if \preload_early == 1 1018 | pld [r3, #(\line_size * 2)] 1019 | #ifdef PRELOAD_CATCH_UP 1020 | add r3, #(\line_size * 3) 1021 | #endif 1022 | .endif 1023 | bic r5, r5, #31 1024 | /* If there are less than 32 bytes to go, skip all loops. */ 1025 | blt 14f 1026 | cmp r2, #(\preload_offset + 32) 1027 | stmfd sp!, {r8, r9, r10, r11} 1028 | sub r9, r5, r1 1029 | /* If there are <= (preload_offset + 32) bytes to go, skip the main loop. */ 1030 | .if \overfetch == 1 1031 | ble 1f 1032 | .else 1033 | ble 10f 1034 | .endif 1035 | .endif 1036 | 1037 | .if \preload_offset != 0 1038 | .if \preload_early == 1 1039 | #ifndef PRELOAD_CATCH_UP 1040 | pld [r3, #(\line_size * 3)] 1041 | #else 1042 | .if \block_write_size >= 16 && \preload_offset >= 96 1043 | /* 1044 | * The last preload already done is at [r3 - line_size]. 1045 | * The next preload in the main loop will happen at [r5 + line_size]. 1046 | * If there are line-sized chunks in between that we have not yet preloaded, 1047 | * we want to do preloads for them. 1048 | */ 1049 | cmp r3, r5 1050 | bge 1f 1051 | #if 0 1052 | /* Implement catch-up using a simple loop. */ 1053 | add r3, r3, #\line_size 1054 | 13: 1055 | cmp r3, r5 1056 | pld [r3, #-\line_size] 1057 | add r3, r3, #\line_size 1058 | blt 13b 1059 | #else 1060 | /* 1061 | * Implement catch-up while processing chunks. block_write_size of 32 1062 | * uses 16-byte writes because of a lack of registers. 1063 | * Note: if line_size is 64 and write alignment is 64, we have to be 1064 | * careful that write alignment remains 64 bytes. 1065 | */ 1066 | pld [r3] 1067 | add r3, r3, #\line_size 1068 | 13: 1069 | cmp r3, r5 1070 | ldmialt r1!, {r7, r8, r10, r11} 1071 | addlt r5, r5, #32 1072 | bge 1f 1073 | .if \line_size == 64 || \write_align == 64 1074 | cmp r2, #(\preload_offset + 64 + 32) 1075 | .else 1076 | cmp r2, #(\preload_offset + 32 + 32) 1077 | .endif 1078 | stmia r0!, {r7, r8, r10, r11} 1079 | pld [r3] 1080 | add r3, r3, #64 1081 | ldmia r1!, {r7, r8, r10, r11} 1082 | sub r2, r2, #32 1083 | stmia r0!, {r7, r8, r10, r11} 1084 | .if \line_size == 32 1085 | pld [r3, #-\line_size] 1086 | .endif 1087 | .if \line_size != 64 || \write_align != 64 1088 | bgt 13b 1089 | .if \overfetch == 0 1090 | b 10f 1091 | .endif 1092 | .else 1093 | /* 1094 | * If line_size is 64 and write_align is 64, make sure 1095 | * the write alignment of 64 maintained. 1096 | * 1097 | * Jump if we don't need to do preloads anymore; 64-byte write 1098 | * alignment is not important in this case. 1099 | */ 1100 | add r5, r5, #32 1101 | ble 10f 1102 | cmp r3, r5 1103 | ldmia r1!, {r7, r8, r10, r11} 1104 | /* In case of a jump, we will be doing more preloads so we */ 1105 | /* have to ensure 64 bytes write alignment. */ 1106 | bge 5f 1107 | cmp r2, #(\preload_offset + 64 + 32) 1108 | stmia r0!, {r7, r8, r10, r11} 1109 | pld [r3] 1110 | add r3, r3, #64 1111 | ldmia r1!, {r7, r8, r10, r11} 1112 | sub r2, r2, #32 1113 | stmia r0!, {r7, r8, r10, r11} 1114 | bgt 13b 1115 | .if \overfetch == 0 1116 | b 10f 1117 | .endif 1118 | .endif /* line_size == 64 write_alignment == 64 */ 1119 | #endif 1120 | .else 1121 | pld [r3] 1122 | .endif 1123 | #endif 1124 | .endif /* preload_early == 1 */ 1125 | 1: 1126 | .if \line_size == 64 || \write_align == 64 1127 | .if \overfetch == 1 1128 | cmp r2, #(64 + 64) 1129 | .else 1130 | cmp r2, #(\preload_offset + 64 + 64) 1131 | .endif 1132 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1133 | stmia r0!, {r3-r6, r7, r8, r10, r11} 1134 | .if \line_size == 32 1135 | pld [r1, r9] 1136 | .endif 1137 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1138 | sub r2, r2, #64 1139 | stmia r0!, {r3-r6, r7, r8, r10, r11} 1140 | pld [r1, r9] 1141 | .else 1142 | .if \overfetch == 1 1143 | cmp r2, #(32 + 32) 1144 | .else 1145 | cmp r2, #(\preload_offset + 32 + 32) 1146 | .endif 1147 | .if \block_write_size == 32 1148 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1149 | sub r2, r2, #32 1150 | stmia r0!, {r3-r6, r7, r8, r10, r11} 1151 | pld [r1, r9] 1152 | .endif 1153 | .if \block_write_size == 16 1154 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1155 | sub r2, r2, #32 1156 | stmia r0!, {r3-r6} 1157 | pld [r1, r9] 1158 | stmia r0!, {r7, r8, r10, r11} 1159 | .endif 1160 | .if \block_write_size == 8 1161 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1162 | sub r2, r2, #32 1163 | stmia r0!, {r3-r4} 1164 | stmia r0!, {r5-r6} 1165 | pld [r1, r9] 1166 | stmia r0!, {r7-r8} 1167 | stmia r0!, {r10-r11} 1168 | .endif 1169 | .endif /* line_size == 64 */ 1170 | bge 1b 1171 | .endif /* preload_offset != 0 */ 1172 | .if \overfetch == 0 1173 | 10: 1174 | .if \line_size == 64 || \write_align == 64 1175 | cmp r2, #(64 + 64) 1176 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1177 | sub r2, r2, #64 1178 | stmia r0!, {r3-r6, r7, r8, r10, r11} 1179 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1180 | stmia r0!, {r3-r6, r7, r8, r10, r11} 1181 | .else 1182 | .if \block_write_size == 32 1183 | cmp r2, #(32 + 32) 1184 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1185 | sub r2, r2, #32 1186 | stmia r0!, {r3-r6, r7, r8, r10, r11} 1187 | .endif 1188 | .if \block_write_size == 16 1189 | cmp r2, #(32 + 32) 1190 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1191 | sub r2, r2, #32 1192 | stmia r0!, {r3-r6} 1193 | stmia r0!, {r7, r8, r10, r11} 1194 | .endif 1195 | .if \block_write_size == 8 1196 | cmp r2, #(32 + 32) 1197 | ldmia r1!, {r3-r6, r7, r8, r10, r11} 1198 | sub r2, r2, #32 1199 | stmia r0!, {r3-r4} 1200 | stmia r0!, {r5-r6} 1201 | stmia r0!, {r7-r8} 1202 | stmia r0!, {r10-r11} 1203 | .endif 1204 | .endif /* line_size == 64 || write_align == 64 */ 1205 | bge 10b 1206 | .endif /* overfetch == 0 */ 1207 | ldmfd sp!, {r8, r9, r10, r11} 1208 | 2: 1209 | .if \line_size == 64 || \write_align == 64 1210 | tst r2, #32 1211 | ldmiane r1!, {r3-r6} 1212 | stmiane r0!, {r3-r6} 1213 | ldmiane r1!, {r3-r6} 1214 | stmiane r0!, {r3-r6} 1215 | .endif 1216 | 14: 1217 | tst r2, #16 1218 | ldmfd sp!, {r7} 1219 | ldmiane r1!, {r3-r6} 1220 | stmiane r0!, {r3-r6} 1221 | 3: 1222 | tst r2, #8 1223 | ldmiane r1!, {r3-r4} 1224 | stmiane r0!, {r3-r4} 1225 | tst r2, #4 1226 | ldrne r3, [r1], #4 1227 | mov ip, r0 1228 | strne r3, [ip], #4 1229 | .if \granularity == 1 1230 | /* Optimize for the word-sized case. */ 1231 | tst r2, #3 1232 | ldmfdeq sp!, {r5, r6} 1233 | ldmfdeq sp!, {r0, r4} 1234 | bxeq lr 1235 | .endif 1236 | .if \granularity <= 2 1237 | tst r2, #2 1238 | ldrhne r3, [r1], #2 1239 | .endif 1240 | ldmfd sp!, {r5, r6} 1241 | .if \granularity <= 2 1242 | strhne r3, [ip], #2 1243 | .endif 1244 | .if \granularity == 1 1245 | tst r2, #1 1246 | ldrbne r3, [r1] 1247 | .endif 1248 | ldmfd sp!, {r0, r4} 1249 | .if \granularity == 1 1250 | strbne r3, [ip] 1251 | .endif 1252 | bx lr 1253 | 5: 1254 | /* We get here in case we need to fix write alignment to 64 bytes. */ 1255 | stmia r0!, {r7, r8, r10, r11} 1256 | ldmia r1!, {r7, r8, r10, r11} 1257 | sub r2, r2, #32 1258 | stmia r0!, {r7, r8, r10, r11} 1259 | b 1b 1260 | .if \granularity == 1 1261 | 6: 1262 | tst r1, #2 1263 | stmfd sp!, {r7} 1264 | bne 8f 1265 | UNALIGNED_MEMCPY_VARIANT \granularity, 1, \line_size, \write_align, \block_write_size, \preload_offset, \preload_early, \overfetch 1266 | .endif 1267 | 7: 1268 | UNALIGNED_MEMCPY_VARIANT \granularity, 2, \line_size, \write_align, \block_write_size, \preload_offset, \preload_early, \overfetch 1269 | .if \granularity == 1 1270 | 8: 1271 | UNALIGNED_MEMCPY_VARIANT \granularity, 3, \line_size, \write_align, \block_write_size, \preload_offset, \preload_early, \overfetch 1272 | .endif 1273 | 1274 | .p2align 4 1275 | .if \granularity <= 2 1276 | 9: 1277 | cmp r2, #8 1278 | push {r0} 1279 | blt 1f /* Jump to special case for really small sizes. */ 1280 | 1281 | /* copy data until destination address is 4 bytes aligned */ 1282 | .if \granularity == 1 1283 | tst r0, #1 1284 | ldrbne r3, [r1], #1 1285 | subne r2, r2, #1 1286 | strbne r3, [r0], #1 1287 | .endif 1288 | 1289 | tst r0, #2 1290 | ldrbne r3, [r1], #1 1291 | ldrbne ip, [r1], #1 1292 | subne r2, r2, #2 1293 | orrne r3, r3, ip, asl #8 1294 | strhne r3, [r0], #2 1295 | /* destination address is 4 bytes aligned */ 1296 | 1297 | /* now we should handle four cases of source address alignment */ 1298 | .if \granularity == 1 1299 | tst r1, #1 1300 | bne 25f 1301 | .endif 1302 | tst r1, #2 1303 | popeq {ip} 1304 | beq 29b /* Jump if the source is word aligned. */ 1305 | 1306 | /* shift 2 */ 1307 | // sub r1, r1, #2 1308 | // ldr ip, [r1], #4 1309 | ldr ip, [r1, #-2] 1310 | add r1, r1, #2 1311 | 23: 1312 | subs r2, r2, #4 1313 | movge r3, ip, lsr #(2 * 8) 1314 | ldrge ip, [r1], #4 1315 | orrge r3, r3, ip, asl #(32 - 2 * 8) 1316 | strge r3, [r0], #4 1317 | bge 23b 1318 | 1319 | sub r1, r1, #2 1320 | tst r2, #2 1321 | ldrbne r3, [r1], #1 1322 | ldrbne ip, [r1], #1 1323 | strbne r3, [r0], #1 1324 | strbne ip, [r0], #1 1325 | 1326 | .if \granularity == 1 1327 | tst r2, #1 1328 | mov ip, r0 1329 | ldrbne r3, [r1] 1330 | ldr r0, [sp], #4 1331 | strbne r3, [ip] 1332 | .else 1333 | pop {r0} 1334 | .endif 1335 | bx lr 1336 | 1337 | /* Handle sizes < 8 */ 1338 | 1: 1339 | .if \granularity == 2 1340 | tst r2, #4 1341 | ldrhne r3, [r1], #2 1342 | ldrhne ip, [r1], #2 1343 | strhne r3, [r0], #2 1344 | strhne ip, [r0], #2 1345 | test r2, #2 1346 | mov ip, r0 1347 | ldrhne r3, [r1] 1348 | pop {r0} 1349 | strhne r3, [ip] 1350 | .else 1351 | tst r2, #4 1352 | ldrbne r3, [r1], #1 1353 | beq 2f 1354 | ldrb ip, [r1], #1 1355 | strb r3, [r0], #1 1356 | strb ip, [r0], #1 1357 | ldrb r3, [r1], #1 1358 | ldrb ip, [r1], #1 1359 | strb r3, [r0], #1 1360 | strb ip, [r0], #1 1361 | 2: 1362 | tst r2, #2 1363 | ldrbne r3, [r1], #1 1364 | ldrbne ip, [r1], #1 1365 | strbne r3, [r0], #1 1366 | strbne ip, [r0], #1 1367 | tst r2, #1 1368 | mov ip, r0 1369 | ldrbne r3, [r1] 1370 | pop {r0} 1371 | strbne r3, [ip] 1372 | .endif 1373 | bx lr 1374 | 1375 | .if \granularity == 1 1376 | 24: 1377 | /* shift 1 */ 1378 | // sub r1, r1, #1 1379 | // ldr ip, [r1], #4 1380 | ldr ip, [r1, #-1] 1381 | add r1, r1, #3 1382 | 27: 1383 | subs r2, r2, #4 1384 | movge r3, ip, lsr #(1 * 8) 1385 | ldrge ip, [r1], #4 1386 | orrge r3, r3, ip, asl #(32 - 1 * 8) 1387 | strge r3, [r0], #4 1388 | bge 27b 1389 | 1390 | sub r1, r1, #3 1391 | tst r2, #2 1392 | ldrbne r3, [r1], #1 1393 | ldrbne ip, [r1], #1 1394 | strbne r3, [r0], #1 1395 | strbne ip, [r0], #1 1396 | 1397 | tst r2, #1 1398 | mov ip, r0 1399 | ldrbne r3, [r1] 1400 | ldr r0, [sp], #4 1401 | strbne r3, [ip] 1402 | bx lr 1403 | 1404 | 25: 1405 | tst r1, #2 1406 | beq 24b /* shift 1 */ 1407 | 1408 | /* shift 3 */ 1409 | 26: 1410 | // sub r1, r1, #3 1411 | // ldr ip, [r1], #4 1412 | ldr ip, [r1, #-3] 1413 | add r1, r1, #1 1414 | 28: 1415 | subs r2, r2, #4 1416 | movge r3, ip, lsr #(3 * 8) 1417 | ldrge ip, [r1], #4 1418 | orrge r3, r3, ip, asl #(32 - 3 * 8) 1419 | strge r3, [r0], #4 1420 | bge 28b 1421 | 1422 | sub r1, r1, #1 1423 | tst r2, #2 1424 | ldrbne r3, [r1], #1 1425 | ldrbne ip, [r1], #1 1426 | strbne r3, [r0], #1 1427 | strbne ip, [r0], #1 1428 | 1429 | tst r2, #1 1430 | mov ip, r0 1431 | ldrbne r3, [r1] 1432 | ldr r0, [sp], #4 1433 | strbne r3, [ip] 1434 | bx lr 1435 | .endif /* granularity == 1 */ 1436 | .endif /* granularity <= 2 */ 1437 | 1438 | .endm 1439 | 1440 | /* 1441 | * The following macros implement a simpler memcpy that is optimized with a fast path 1442 | * for common cases and may use unaligned access for small sizes. 1443 | * 1444 | * line_size of 64 or 32 is supported, write_align must be 32 or 16, block_write_size 1445 | * must be 32 or 16, early_preload and overfetch are enabled. 1446 | */ 1447 | 1448 | .macro MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN write_align 1449 | /* Align to a 16-byte or 32-byte boundary. */ 1450 | tst r0, #4 1451 | ldrne r3, [r1], #4 1452 | subne r2, r2, #4 1453 | strne r3, [r0], #4 1454 | tst r0, #8 1455 | ldrne r3, [r1], #4 1456 | ldrne ip, [r1], #4 1457 | subne r2, r2, #8 1458 | strne r3, [r0], #4 1459 | strne ip, [r0], #4 1460 | .if \write_align >= 32 1461 | tst r0, #16 1462 | ldrne r3, [r1] 1463 | beq 31f 1464 | ldr ip, [r1, #4] 1465 | str r3, [r0] 1466 | sub r2, r2, #16 1467 | str ip, [r0, #4] 1468 | ldr r3, [r1, #8] 1469 | ldr ip, [r1, #12] 1470 | add r1, #16 1471 | str r3, [r0, #8] 1472 | str ip, [r0, #12] 1473 | add r0, #16 1474 | 31: 1475 | .endif 1476 | .if \write_align == 64 1477 | tst r0, #32 1478 | ldmiane r1!, {r3, ip} 1479 | beq 32f 1480 | stmia r0!, {r3, ip} 1481 | ldmia r1!, {r3, ip} 1482 | stmia r0!, {r3, ip} 1483 | ldmia r1!, {r3, ip} 1484 | stmia r0!, {r3, ip} 1485 | ldmia r1!, {r3, ip} 1486 | sub r2, r2, #32 1487 | stmia r0!, {r3, ip} 1488 | 32: 1489 | .endif 1490 | .endm 1491 | 1492 | .macro MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN_CUSTOM 1493 | /* 1494 | * Align destination to a 16-byte or 32-byte boundary, 1495 | * depending on whether the 32-byte alignment of the 1496 | * source is optimal. 1497 | */ 1498 | tst r0, #4 1499 | ldrne r3, [r1], #4 1500 | subne r2, r2, #4 1501 | strne r3, [r0], #4 1502 | tst r0, #8 1503 | ldrne r3, [r1], #4 1504 | ldrne ip, [r1], #4 1505 | subne r2, r2, #8 1506 | strne r3, [r0], #4 1507 | strne ip, [r0], #4 1508 | /* 1509 | * If (source & 16) is zero, allow write aligning to 32 bytes. 1510 | * This improves performance. 1511 | */ 1512 | eor r3, r1, r0 1513 | tst r0, #16 1514 | tstne r3, #16 1515 | ldrne r3, [r1] 1516 | beq 31f 1517 | ldr ip, [r1, #4] 1518 | str r3, [r0] 1519 | sub r2, r2, #16 1520 | str ip, [r0, #4] 1521 | ldr r3, [r1, #8] 1522 | ldr ip, [r1, #12] 1523 | add r1, #16 1524 | str r3, [r0, #8] 1525 | str ip, [r0, #12] 1526 | add r0, #16 1527 | 31: 1528 | .endm 1529 | 1530 | .macro MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART shift, line_size, write_align, block_write_size, preload_offset, custom_write_align 1531 | ldr ip, [r1, #(-\shift)] 1532 | add r1, r1, #(4 - \shift) 1533 | 1534 | tst r0, #4 1535 | push {r4-r6} 1536 | movne r3, ip, lsr #(\shift * 8) 1537 | ldrne ip, [r1], #4 1538 | subne r2, r2, #4 1539 | orrne r3, r3, ip, asl #(32 - \shift * 8) 1540 | strne r3, [r0], #4 1541 | 1542 | tst r0, #8 1543 | movne r3, ip, lsr #(\shift * 8) 1544 | ldmiane r1!, {r4, ip} 1545 | subne r2, r2, #8 1546 | orrne r3, r3, r4, asl #(32 - \shift * 8) 1547 | movne r4, r4, lsr #(\shift * 8) 1548 | orrne r4, r4, ip, asl #(32 - \shift * 8) 1549 | stmiane r0!, {r3-r4} 1550 | 1551 | .if \write_align == 32 1552 | .if \custom_write_align == 1 1553 | eor r3, r1, r0 1554 | tst r0, #16 1555 | tstne r3, #16 1556 | .else 1557 | tst r0, #16 1558 | .endif 1559 | movne r3, ip, lsr #(\shift * 8) 1560 | beq 25f 1561 | ldmia r1!, {r4-r6, ip} 1562 | sub r2, r2, #16 1563 | orr r3, r3, r4, asl #(32 - \shift * 8) 1564 | mov r4, r4, lsr #(\shift * 8) 1565 | .if (68 - (\write_align - 1)) < 32 1566 | cmp r2, #32 1567 | .endif 1568 | orr r4, r4, r5, asl #(32 - \shift * 8) 1569 | mov r5, r5, lsr #(\shift * 8) 1570 | orr r5, r5, r6, asl #(32 - \shift * 8) 1571 | mov r6, r6, lsr #(\shift * 8) 1572 | orr r6, r6, ip, asl #(32 - \shift * 8) 1573 | stmia r0!, {r3-r6} 1574 | .if (68 - (\write_align - 1)) < 32 1575 | blt 22f 1576 | b 26f 1577 | .endif 1578 | 25: 1579 | .endif 1580 | 1581 | /* 1582 | * We don't need a check if the number of bytes left is guaranteed to 1583 | * be >= 32. 1584 | */ 1585 | .if (68 - (\write_align - 1)) < 32 1586 | cmp r2, #32 1587 | blt 22f 1588 | .endif 1589 | 26: 1590 | .if \write_align == \line_size && 0 1591 | push {r7-r11} 1592 | mov r9, #\preload_offset 1593 | sub r2, r2, #32 1594 | .else 1595 | add r3, r1, #\preload_offset 1596 | push {r7-r11} 1597 | bic r3, r3, #(\line_size - 1) 1598 | sub r2, r2, #32 1599 | sub r9, r3, r1 1600 | .endif 1601 | /* 1602 | * Main loop for unaligned copy. Process 32 bytes at a time. 1603 | */ 1604 | 21: 1605 | pld [r1, r9] 1606 | mov r3, ip, lsr #(\shift * 8) 1607 | ldmia r1!, {r4-r6, r7, r8, r10, r11, ip} 1608 | orr r3, r3, r4, asl #(32 - \shift * 8) 1609 | mov r4, r4, lsr #(\shift * 8) 1610 | subs r2, r2, #32 1611 | orr r4, r4, r5, asl #(32 - \shift * 8) 1612 | mov r5, r5, lsr #(\shift * 8) 1613 | orr r5, r5, r6, asl #(32 - \shift * 8) 1614 | mov r6, r6, lsr #(\shift * 8) 1615 | orr r6, r6, r7, asl #(32 - \shift * 8) 1616 | mov r7, r7, lsr #(\shift * 8) 1617 | .if \block_write_size == 16 1618 | stmia r0!, {r3-r6} 1619 | .endif 1620 | orr r7, r7, r8, asl #(32 - \shift * 8) 1621 | mov r8, r8, lsr #(\shift * 8) 1622 | orr r8, r8, r10, asl #(32 - \shift * 8) 1623 | mov r10, r10, lsr #(\shift * 8) 1624 | orr r10, r10, r11, asl #(32 - \shift * 8) 1625 | mov r11, r11, lsr #(\shift * 8) 1626 | orr r11, r11, ip, asl #(32 - \shift * 8) 1627 | .if \block_write_size == 32 1628 | stmia r0!, {r3-r6, r7, r8, r10, r11} 1629 | .endif 1630 | .if \block_write_size == 16 1631 | stmia r0!, {r7, r8, r10, r11} 1632 | .endif 1633 | bge 21b 1634 | adds r2, r2, #32 1635 | pop {r7-r11} 1636 | popeq {r4-r6} 1637 | popeq {r0} 1638 | bxeq lr 1639 | 22: 1640 | pop {r4-r6} 1641 | 23: 1642 | subs r2, r2, #4 1643 | movge r3, ip, lsr #(\shift * 8) 1644 | ldrge ip, [r1], #4 1645 | orrge r3, r3, ip, asl #(32 - \shift * 8) 1646 | strge r3, [r0], #4 1647 | bgt 23b 1648 | 1649 | 24: 1650 | sub r1, r1, #(4 - \shift) 1651 | .endm 1652 | 1653 | 1654 | .macro MEMCPY_VARIANT_SIMPLE granularity, line_size, write_align, block_write_size, \ 1655 | preload_offset, preload_catch_up, preload_early, overfetch, custom_write_align, \ 1656 | check_small_size_alignment 1657 | cmp r2, #68 1658 | .if \preload_early == 1 1659 | bic r3, r1, #(\line_size - 1) 1660 | .endif 1661 | mov ip, r0 1662 | .if \preload_early == 1 1663 | pld [r3] 1664 | .endif 1665 | bge 1f 1666 | 1667 | /* 1668 | * Path for sizes < 68 bytes; don't care about unaligned access 1669 | * except if both the source and destination are unaligned and 1670 | * the number of bytes is > 32. This checks costs a few percent 1671 | * performance for the common word aligned-case. 1672 | */ 1673 | .if \check_small_size_alignment == 1 1674 | .if \granularity <= 2 1675 | /* This assumes lt flag is set. */ 1676 | tst r0, #3 1677 | tstne r1, #3 1678 | cmpne r2, #32 1679 | bgt 2f 1680 | .endif 1681 | .endif 1682 | 3: 1683 | tst r2, #4 1684 | ldrne r3, [r1], #4 1685 | subne r2, r2, #4 1686 | strne r3, [r0], #4 1687 | 4: 1688 | cmp r2, #8 1689 | ldrge r3, [r1], #4 1690 | strge r3, [r0], #4 1691 | ldrge r3, [r1], #4 1692 | subge r2, r2, #8 1693 | strge r3, [r0], #4 1694 | bgt 4b 1695 | .if \granularity <= 2 1696 | tstne r2, #3 1697 | moveq r0, ip 1698 | bxeq lr 1699 | .endif 1700 | .if \granularity <= 2 1701 | tst r2, #2 1702 | ldrhne r3, [r1], #2 1703 | strhne r3, [r0], #2 1704 | .endif 1705 | .if \granularity == 1 1706 | tst r2, #1 1707 | ldrbne r3, [r1] 1708 | strbne r3, [r0] 1709 | .endif 1710 | mov r0, ip 1711 | bx lr 1712 | 1713 | .if \check_small_size_alignment == 1 1714 | .if \granularity <= 2 1715 | 2: 1716 | /* Align the destination. */ 1717 | .if \granularity == 1 1718 | tst r0, #1 1719 | .if \preload_early == 1 && \line_size == 32 1720 | pld [r3, #32] 1721 | .endif 1722 | ldrbne r3, [r1], #1 1723 | subne r2, r2, #1 1724 | strbne r3, [r0], #1 1725 | .endif 1726 | 1727 | tst r0, #2 1728 | .if \granularity == 2 && \preload_early == 1 && \line_size == 32 1729 | pld [r3, #32] 1730 | .endif 1731 | ldrbne r3, [r1], #1 1732 | ldrbne ip, [r1], #1 1733 | subne r2, r2, #2 1734 | orrne r3, r3, ip, asl #8 1735 | strhne r3, [r0], #2 1736 | b 3b 1737 | .endif 1738 | .endif 1739 | 1740 | /* Aligning this branch target to a 16-byte boundary helps performance a bit. */ 1741 | .p2align 4 1742 | 1: 1743 | /* Check that both destination and source are word aligned. */ 1744 | .if \granularity <= 2 1745 | tst r0, #3 1746 | .endif 1747 | push {r0} 1748 | .if \granularity == 1 1749 | tsteq r1, #3 1750 | .endif 1751 | .if \preload_early == 1 1752 | pld [r3, #\line_size] 1753 | .endif 1754 | push {r3} 1755 | .if \granularity <= 2 1756 | bne 3f 1757 | .endif 1758 | 1759 | /* Larger sizes with word aligned source and destination. */ 1760 | 2: 1761 | .if \custom_write_align == 1 1762 | MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN_CUSTOM 1763 | .else 1764 | MEMCPY_VARIANT_SIMPLE_WRITE_ALIGN \write_align 1765 | .endif 1766 | /* 1767 | * We don't need a check if the number of bytes left is guaranteed to 1768 | * be >= line_size. 1769 | */ 1770 | .if (68 - (\write_align - 1)) >= \line_size 1771 | pop {r3} 1772 | .if \write_align == \line_size && 0 1773 | mov ip, #\preload_offset 1774 | .if \preload_early == 1 1775 | pld [r3, #(\line_size * 2)] 1776 | .endif 1777 | push {r4-r11} 1778 | .else 1779 | add ip, r1, #\preload_offset 1780 | .if \preload_early == 1 1781 | pld [r3, #(\line_size * 2)] 1782 | .endif 1783 | push {r4-r11} 1784 | bic ip, ip, #(\line_size - 1) 1785 | sub ip, ip, r1 1786 | .endif 1787 | .else 1788 | cmp r2, #\line_size 1789 | .if \write_align == \line_size && 0 1790 | pop {r3} 1791 | mov ip, #\preload_offset 1792 | .if \preload_early == 1 1793 | pld [r3, #(\line_size * 2)] 1794 | .endif 1795 | pushge {r4-r11} 1796 | blt 6f 1797 | .else 1798 | pop {r3} 1799 | addge ip, r1, #\preload_offset 1800 | .if \preload_early == 1 1801 | pld [r3, #(\line_size * 2)] 1802 | .endif 1803 | pushge {r4-r11} 1804 | blt 6f 1805 | bic ip, ip, #(\line_size - 1) 1806 | sub ip, ip, r1 1807 | .endif 1808 | .endif 1809 | 1810 | .if \line_size == 32 1811 | .if \preload_early == 1 && \preload_offset >= 96 1812 | .if \preload_catch_up == 1 1813 | add r4, r1, ip 1814 | add r3, r3, #(\line_size * 3) 1815 | cmp r3, r4 1816 | addlt r3, r3, #\line_size 1817 | bge 12f 1818 | 11: 1819 | cmp r3, r4 1820 | pld [r3, #-\line_size] 1821 | add r3, r3, #\line_size 1822 | blt 11b 1823 | 12: 1824 | .else 1825 | pld [r3, #(\line_size * 3)] 1826 | .endif 1827 | .endif 1828 | sub r2, r2, #32 1829 | 5: 1830 | /* 1831 | * The main loop for large sizes. Copy 32 bytes at a time 1832 | * using ldmia/stmia while prefetching a 32-byte aligned 1833 | * address. 1834 | */ 1835 | pld [r1, ip] 1836 | .if \block_write_size == 32 1837 | ldmia r1!, {r4-r11} 1838 | subs r2, r2, #32 1839 | stmia r0!, {r4-r11} 1840 | .else 1841 | ldmia r1!, {r4-r7} 1842 | subs r2, r2, #32 1843 | ldmia r1!, {r8-r11} 1844 | stmia r0!, {r4-r7} 1845 | stmia r0!, {r8-r11} 1846 | .endif 1847 | bge 5b 1848 | adds r2, r2, #32 1849 | pop {r4-r11} 1850 | popeq {r0} 1851 | bxeq lr 1852 | .endif 1853 | 1854 | .if \line_size == 64 1855 | .if \preload_early == 1 && \preload_offset >= 128 1856 | .if \preload_catch_up == 1 1857 | add r4, r1, ip 1858 | add r3, r3, #(\line_size * 3) 1859 | cmp r3, r4 1860 | addlt r3, r3, #\line_size 1861 | bge 12f 1862 | 11: 1863 | cmp r3, r4 1864 | pld [r3, #-\line_size] 1865 | add r3, r3, #\line_size 1866 | blt 11b 1867 | 12: 1868 | .else 1869 | pld [r3, #(\line_size * 3)] 1870 | .endif 1871 | .endif 1872 | sub r2, r2, #64 1873 | /* Aligning the main loop branch target seems to help performance a bit. */ 1874 | b 5f 1875 | .p2align 4 1876 | 5: 1877 | /* 1878 | * The main loop for large sizes. Copy 64 bytes at a time 1879 | * using ldmia/stmia while prefetching a 64-byte aligned 1880 | * address. 1881 | */ 1882 | pld [r1, ip] 1883 | ldmia r1!, {r4-r11} 1884 | subs r2, r2, #64 1885 | stmia r0!, {r4-r11} 1886 | ldmia r1!, {r4-r11} 1887 | stmia r0!, {r4-r11} 1888 | bge 5b 1889 | adds r2, r2, #64 1890 | pop {r4-r11} 1891 | popeq {r0} 1892 | bxeq lr 1893 | .endif 1894 | 1895 | 6: 1896 | .if \line_size == 64 1897 | cmp r2, #32 1898 | ldmiage r1!, {r3, ip} 1899 | blt 10f 1900 | stmia r0!, {r3, ip} 1901 | ldmia r1!, {r3, ip} 1902 | stmia r0!, {r3, ip} 1903 | ldmia r1!, {r3, ip} 1904 | stmia r0!, {r3, ip} 1905 | ldmia r1!, {r3, ip} 1906 | sub r2, r2, #32 1907 | stmia r0!, {r3, ip} 1908 | popeq {r0} 1909 | bxeq lr 1910 | 10: 1911 | .endif 1912 | 1913 | cmp r2, #16 1914 | ldrge r3, [r1] 1915 | ldrge ip, [r1, #4] 1916 | blt 7f 1917 | sub r2, r2, #16 1918 | str r3, [r0] 1919 | str ip, [r0, #4] 1920 | ldr r3, [r1, #8] 1921 | ldr ip, [r1, #12] 1922 | add r1, r1, #16 1923 | str r3, [r0, #8] 1924 | str ip, [r0, #12] 1925 | popeq {r0} 1926 | bxeq lr 1927 | add r0, r0, #16 1928 | 7: 1929 | cmp r2, #8 1930 | ldrge ip, [r1] 1931 | ldrge r3, [r1, #4] 1932 | strge ip, [r0], #4 1933 | pop {ip} 1934 | strge r3, [r0], #4 1935 | moveq r0, ip 1936 | bxeq lr 1937 | addge r1, r1, #8 1938 | 1939 | tst r2, #4 1940 | ldrne r3, [r1], #4 1941 | strne r3, [r0], #4 1942 | tst r2, #3 1943 | moveq r0, ip 1944 | bxeq lr 1945 | .if \granularity <= 2 1946 | tst r2, #2 1947 | ldrhne r3, [r1], #2 1948 | strhne r3, [r0], #2 1949 | .endif 1950 | .if \granularity == 1 1951 | tst r2, #1 1952 | ldrbne r3, [r1] 1953 | strbne r3, [r0] 1954 | .endif 1955 | mov r0, ip 1956 | bx lr 1957 | 1958 | .if \granularity <= 2 1959 | 3: 1960 | /* 1961 | * Copy data until destination address is 4 bytes aligned. 1962 | */ 1963 | .if \granularity == 1 1964 | tst r0, #1 1965 | ldrbne r3, [r1], #1 1966 | subne r2, r2, #1 1967 | strbne r3, [r0], #1 1968 | .endif 1969 | 1970 | tst r0, #2 1971 | ldrbne r3, [r1], #1 1972 | ldrbne ip, [r1], #1 1973 | subne r2, r2, #2 1974 | orrne r3, r3, ip, asl #8 1975 | strhne r3, [r0], #2 1976 | /* destination address is 4 bytes aligned */ 1977 | 1978 | tst r1, #3 1979 | popne {r3} 1980 | beq 2b 1981 | 1982 | /* Unaligned copy. */ 1983 | .if \granularity == 1 1984 | tst r1, #1 1985 | .endif 1986 | .if \preload_early == 1 1987 | pld [r3, #(\line_size * 2)] 1988 | .endif 1989 | .if \granularity == 1 1990 | bne 2f 1991 | .endif 1992 | 1993 | MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART 2, line_size, write_align, block_write_size, preload_offset, custom_write_align 1994 | 4: 1995 | tst r2, #2 1996 | ldrbne r3, [r1], #1 1997 | ldrbne ip, [r1], #1 1998 | strbne r3, [r0], #1 1999 | strbne ip, [r0], #1 2000 | 2001 | .if \granularity == 1 2002 | tst r2, #1 2003 | mov ip, r0 2004 | ldrbne r3, [r1] 2005 | ldr r0, [sp], #4 2006 | strbne r3, [ip] 2007 | .else 2008 | pop {r0} 2009 | .endif 2010 | bx lr 2011 | 2012 | .if \granularity == 1 2013 | 3: 2014 | MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART 3, line_size, write_align, block_write_size, preload_offset, custom_write_align 2015 | b 4b 2016 | 2017 | 2: 2018 | tst r1, #2 2019 | bne 3b 2020 | 2021 | MEMCPY_VARIANT_SIMPLE_UNALIGNED_MAIN_PART 1, line_size, write_align, block_write_size, preload_offset, custom_write_align 2022 | b 4b 2023 | .endif 2024 | 2025 | .endif 2026 | 2027 | .endm 2028 | 2029 | /* 2030 | * Settings for the MEMCPY_VARIANT_SIMPLE macro 2031 | * 2032 | * granularity 2033 | * Must be 1, 2 or 4. This value is 1 for normal memcpy, 2 for operations on half-word 2034 | * aligned regions such as 16bpp framebuffers/images, and 4 for operations on word aligned 2035 | * regions such as 32bpp framebuffers\images. 2036 | * line_size 2037 | * Must be 32 or 64. Defines the cache line size used for preloads. Preloads are only done 2038 | * at line_size aligned addresses. When early preload is enabled, the current implementation 2039 | * results in more aggressive early preload in the case of a line size of 64. 2040 | * write_align 2041 | * Must be 16, 32, or 64. Defines the write alignment that is applied just before the main loop 2042 | * for larger sizes. The main loop processes chunks of line_size bytes at a time. 2043 | * block_write_size 2044 | * Must must be 32, 16. Defines the size of multiple-register load and store instructions that 2045 | * are used in the main loop for larger sizes. 2046 | * preload_offset 2047 | * Must be a multiple of line_size. Defines the offset from the current source address at which 2048 | * preloads are performed (look-ahead) in the main loop. The real applied offset is derived before 2049 | * the start of the main loop by adding the preload offset to the source address and rounding 2050 | * down the result to a line_size boundary, and then substracting the source address. 2051 | * preload_catch_up 2052 | * Must be 0 or 1. When early preload is enabled, this enables code just before the main loop 2053 | * that performs a series of preloads from just beyond the last early preload to just before 2054 | * the first preload in the main loop, filling in the gap. 2055 | * preload_early 2056 | * Must be 0 or 1. When enabled, preload instructions are enabled early in the memcpy function 2057 | * to preload the initial part of the source memory region. Early preloads start at the source 2058 | * address aligned to a line_size boundary and end at that address + line_size * 2 (three 2059 | * early preloads in total). 2060 | * overfetch 2061 | * Must be 1. 2062 | * custom_write_align 2063 | * Must be 0 or 1. Enables RPi-specific write alignment whereby 32-byte alignment is only applied 2064 | * if the source address will be located after alignment in the second half of a 32-byte aligned 2065 | * chunk; if not, write alignment remains at 16 bytes. 2066 | * check_small_size_alignment 2067 | * Must be 0 or 1. For small sizes less than 68 bytes, unaligned memory access is used to reduce 2068 | * overhead in improve performance. However, when both source and destination are unaligned 2069 | * this induce a performance penalty. When this option is enabled, beyond a certain size threshold 2070 | * (currently set at 32 bytes), the destination is aligned to a word boundary. This may speed up 2071 | * unaligned copies in the range of 33 to 67 bytes. 2072 | * 2073 | * Restrictions: 2074 | * If line_size is 64, write_align must be 32 or 64, block_write_size must be 32, preload_offset 2075 | * must be a multiple of 64. 2076 | * If preload_catch_up is 1 then preload_early must be 1. 2077 | */ 2078 | 2079 | 2080 | #if defined(MEMCPY_REPLACEMENT_SUNXI) || defined(MEMCPY_REPLACEMENT_RPI) 2081 | 2082 | #ifdef MEMCPY_REPLACEMENT_SUNXI 2083 | 2084 | /* memcpy replacement for the Allwinner platform. */ 2085 | 2086 | asm_function memcpy 2087 | MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 0, 1, 1, 0, 0 2088 | .endfunc 2089 | 2090 | #endif 2091 | 2092 | #ifdef MEMCPY_REPLACEMENT_RPI 2093 | 2094 | /* memcpy replacement for the RPi platform. */ 2095 | 2096 | asm_function memcpy 2097 | MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 128, 1, 1, 1, 1, 1 2098 | .endfunc 2099 | 2100 | #endif 2101 | 2102 | #else 2103 | 2104 | #ifdef RPI_BEST_MEMCPY_ONLY 2105 | 2106 | /* Optimized memcpy variants for the RPi platform . */ 2107 | 2108 | asm_function memcpy_armv5te_no_overfetch 2109 | MEMCPY_VARIANT 1, 32, 16, 16, 96, 1, 0 2110 | .endfunc 2111 | 2112 | asm_function memcpy_armv5te_overfetch 2113 | MEMCPY_VARIANT 1, 32, 16, 16, 128, 1, 1 2114 | .endfunc 2115 | 2116 | asm_function memcpy_halfwords_armv5te_no_overfetch 2117 | MEMCPY_VARIANT 2, 32, 16, 16, 96, 1, 0 2118 | .endfunc 2119 | 2120 | asm_function memcpy_halfwords_armv5te_overfetch 2121 | MEMCPY_VARIANT 2, 32, 16, 16, 128, 1, 1 2122 | .endfunc 2123 | 2124 | asm_function memcpy_words_armv5te_no_overfetch 2125 | MEMCPY_VARIANT 4, 32, 16, 16, 96, 1, 0 2126 | .endfunc 2127 | 2128 | asm_function memcpy_words_armv5te_overfetch 2129 | MEMCPY_VARIANT 4, 32, 16, 16, 128, 1, 1 2130 | .endfunc 2131 | 2132 | #else 2133 | 2134 | /* A large set of memcpy variants, used in the benchmark program */ 2135 | 2136 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_8_preload_96 2137 | MEMCPY_VARIANT 1, 32, 16, 8, 96, 0, 0 2138 | .endfunc 2139 | 2140 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_96 2141 | MEMCPY_VARIANT 1, 32, 16, 16, 96, 0, 0 2142 | .endfunc 2143 | 2144 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_96 2145 | MEMCPY_VARIANT 1, 32, 16, 16, 96, 1, 0 2146 | .endfunc 2147 | 2148 | asm_function memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_128 2149 | MEMCPY_VARIANT 1, 32, 16, 16, 128, 1, 0 2150 | .endfunc 2151 | 2152 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_8_preload_96 2153 | MEMCPY_VARIANT 1, 32, 32, 8, 96, 0, 0 2154 | .endfunc 2155 | 2156 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_64 2157 | MEMCPY_VARIANT 1, 32, 32, 16, 64, 0, 0 2158 | .endfunc 2159 | 2160 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_96 2161 | MEMCPY_VARIANT 1, 32, 32, 16, 96, 0, 0 2162 | .endfunc 2163 | 2164 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_128 2165 | MEMCPY_VARIANT 1, 32, 32, 16, 128, 0, 0 2166 | .endfunc 2167 | 2168 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_160 2169 | MEMCPY_VARIANT 1, 32, 32, 16, 160, 0, 0 2170 | .endfunc 2171 | 2172 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_192 2173 | MEMCPY_VARIANT 1, 32, 32, 16, 192, 0, 0 2174 | .endfunc 2175 | 2176 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_256 2177 | MEMCPY_VARIANT 1, 32, 32, 16, 256, 0, 0 2178 | .endfunc 2179 | 2180 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_64 2181 | MEMCPY_VARIANT 1, 32, 32, 32, 64, 0, 0 2182 | .endfunc 2183 | 2184 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_96 2185 | MEMCPY_VARIANT 1, 32, 32, 32, 96, 0, 0 2186 | .endfunc 2187 | 2188 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_128 2189 | MEMCPY_VARIANT 1, 32, 32, 32, 128, 0, 0 2190 | .endfunc 2191 | 2192 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_160 2193 | MEMCPY_VARIANT 1, 32, 32, 32, 160, 0, 0 2194 | .endfunc 2195 | 2196 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_192 2197 | MEMCPY_VARIANT 1, 32, 32, 32, 192, 0, 0 2198 | .endfunc 2199 | 2200 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_256 2201 | MEMCPY_VARIANT 1, 32, 32, 32, 256, 0, 0 2202 | .endfunc 2203 | 2204 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_96 2205 | MEMCPY_VARIANT 1, 32, 32, 16, 96, 1, 0 2206 | .endfunc 2207 | 2208 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_128 2209 | MEMCPY_VARIANT 1, 32, 32, 16, 128, 1, 0 2210 | .endfunc 2211 | 2212 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_192 2213 | MEMCPY_VARIANT 1, 32, 32, 16, 192, 1, 0 2214 | .endfunc 2215 | 2216 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_256 2217 | MEMCPY_VARIANT 1, 32, 32, 16, 256, 1, 0 2218 | .endfunc 2219 | 2220 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_128 2221 | MEMCPY_VARIANT 1, 32, 32, 32, 128, 1, 0 2222 | .endfunc 2223 | 2224 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_192 2225 | MEMCPY_VARIANT 1, 32, 32, 32, 192, 1, 0 2226 | .endfunc 2227 | 2228 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_256 2229 | MEMCPY_VARIANT 1, 32, 32, 32, 256, 1, 0 2230 | .endfunc 2231 | 2232 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_16_no_preload 2233 | MEMCPY_VARIANT 1, 32, 32, 16, 0, 0, 0 2234 | .endfunc 2235 | 2236 | asm_function memcpy_armv5te_no_overfetch_align_32_block_write_32_no_preload 2237 | MEMCPY_VARIANT 1, 32, 32, 32, 0, 0, 0 2238 | .endfunc 2239 | 2240 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_128 2241 | MEMCPY_VARIANT 1, 64, 32, 32, 128, 1, 0 2242 | .endfunc 2243 | 2244 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192 2245 | MEMCPY_VARIANT 1, 64, 32, 32, 192, 1, 0 2246 | .endfunc 2247 | 2248 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_256 2249 | MEMCPY_VARIANT 1, 64, 32, 32, 256, 1, 0 2250 | .endfunc 2251 | 2252 | asm_function memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_320 2253 | MEMCPY_VARIANT 1, 64, 32, 32, 320, 1, 0 2254 | .endfunc 2255 | 2256 | asm_function memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_192 2257 | MEMCPY_VARIANT 1, 64, 64, 32, 192, 1, 0 2258 | .endfunc 2259 | 2260 | asm_function memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_256 2261 | MEMCPY_VARIANT 1, 64, 64, 32, 256, 1, 0 2262 | .endfunc 2263 | 2264 | asm_function memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_320 2265 | MEMCPY_VARIANT 1, 64, 64, 32, 320, 1, 0 2266 | .endfunc 2267 | 2268 | /* Overfetching versions. */ 2269 | 2270 | asm_function memcpy_armv5te_overfetch_align_16_block_write_16_preload_early_128 2271 | MEMCPY_VARIANT 1, 32, 16, 16, 128, 1, 1 2272 | .endfunc 2273 | 2274 | asm_function memcpy_armv5te_overfetch_align_32_block_write_32_preload_early_192 2275 | MEMCPY_VARIANT 1, 32, 32, 32, 192, 1, 1 2276 | .endfunc 2277 | 2278 | asm_function memcpy_simple_sunxi_preload_early_192 2279 | MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 1, 1, 1, 0, 0 2280 | .endfunc 2281 | 2282 | asm_function memcpy_simple_sunxi_preload_early_192_no_catch_up 2283 | MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 0, 1, 1, 0, 0 2284 | .endfunc 2285 | 2286 | asm_function memcpy_simple_sunxi_preload_early_192_no_catch_up_check_small_size_alignment 2287 | MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 192, 0, 1, 1, 0, 1 2288 | .endfunc 2289 | 2290 | asm_function memcpy_simple_sunxi_preload_early_256 2291 | MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 256, 1, 1, 1, 0, 0 2292 | .endfunc 2293 | 2294 | asm_function memcpy_simple_sunxi_preload_early_256_no_catch_up 2295 | MEMCPY_VARIANT_SIMPLE 1, 64, 32, 32, 256, 0, 1, 1, 0, 0 2296 | .endfunc 2297 | 2298 | asm_function memcpy_simple_rpi_preload_early_96 2299 | MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 96, 1, 1, 1, 1, 1 2300 | .endfunc 2301 | 2302 | asm_function memcpy_simple_rpi_preload_early_96_no_catch_up 2303 | MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 96, 0, 1, 1, 1, 0 2304 | .endfunc 2305 | 2306 | asm_function memcpy_simple_rpi_preload_early_96_no_catch_up_check_small_size_alignment 2307 | MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 96, 0, 1, 1, 1, 1 2308 | .endfunc 2309 | 2310 | asm_function memcpy_simple_rpi_preload_early_128 2311 | MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 128, 1, 1, 1, 1, 1 2312 | .endfunc 2313 | 2314 | asm_function memcpy_simple_rpi_preload_early_128_no_catch_up 2315 | MEMCPY_VARIANT_SIMPLE 1, 32, 32, 16, 128, 0, 1, 1, 1, 1 2316 | .endfunc 2317 | 2318 | #endif 2319 | 2320 | #endif 2321 | 2322 | #endif 2323 | -------------------------------------------------------------------------------- /arm_asm.h: -------------------------------------------------------------------------------- 1 | 2 | extern void *memcpy_armv5te(void *dest, const void *src, size_t n); 3 | 4 | #ifdef RPI_BEST_MEMCPY_ONLY 5 | 6 | extern void *memcpy_armv5te_no_overfetch(void *dest, const void *src, size_t n); 7 | 8 | extern void *memcpy_armv5te_overfetch(void *dest, const void *src, size_t n); 9 | 10 | extern void *memcpy_halfwords_armv5te_no_overfetch(void *dest, const void *src, size_t n); 11 | 12 | extern void *memcpy_halfwords_armv5te_overfetch(void *dest, const void *src, size_t n); 13 | 14 | extern void *memcpy_words_armv5te_no_overfetch(void *dest, const void *src, size_t n); 15 | 16 | extern void *memcpy_words_armv5te_overfetch(void *dest, const void *src, size_t n); 17 | 18 | #else 19 | 20 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_8_preload_96(void *dest, 21 | const void *src, size_t n); 22 | 23 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_96(void *dest, 24 | const void *src, size_t n); 25 | 26 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_96(void *dest, 27 | const void *src, size_t n); 28 | 29 | extern void *memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_128(void *dest, 30 | const void *src, size_t n); 31 | 32 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_8_preload_96(void *dest, 33 | const void *src, size_t n); 34 | 35 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_64(void *dest, 36 | const void *src, size_t n); 37 | 38 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_96(void *dest, 39 | const void *src, size_t n); 40 | 41 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_128(void *dest, 42 | const void *src, size_t n); 43 | 44 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_160(void *dest, 45 | const void *src, size_t n); 46 | 47 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_192(void *dest, 48 | const void *src, size_t n); 49 | 50 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_256(void *dest, 51 | const void *src, size_t n); 52 | 53 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_64(void *dest, 54 | const void *src, size_t n); 55 | 56 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_96(void *dest, 57 | const void *src, size_t n); 58 | 59 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_128(void *dest, 60 | const void *src, size_t n); 61 | 62 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_160(void *dest, 63 | const void *src, size_t n); 64 | 65 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_192(void *dest, 66 | const void *src, size_t n); 67 | 68 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_256(void *dest, 69 | const void *src, size_t n); 70 | 71 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_96(void *dest, 72 | const void *src, size_t n); 73 | 74 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_128(void *dest, 75 | const void *src, size_t n); 76 | 77 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_192(void *dest, 78 | const void *src, size_t n); 79 | 80 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_256(void *dest, 81 | const void *src, size_t n); 82 | 83 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_128(void *dest, 84 | const void *src, size_t n); 85 | 86 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_192(void *dest, 87 | const void *src, size_t n); 88 | 89 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_256(void *dest, 90 | const void *src, size_t n); 91 | 92 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_16_no_preload(void *dest, 93 | const void *src, size_t n); 94 | 95 | extern void *memcpy_armv5te_no_overfetch_align_32_block_write_32_no_preload(void *dest, 96 | const void *src, size_t n); 97 | 98 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_128(void *dest, 99 | const void *src, size_t n); 100 | 101 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192(void *dest, 102 | const void *src, size_t n); 103 | 104 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_256(void *dest, 105 | const void *src, size_t n); 106 | 107 | extern void *memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_320(void *dest, 108 | const void *src, size_t n); 109 | 110 | extern void *memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_192(void *dest, 111 | const void *src, size_t n); 112 | 113 | extern void *memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_256(void *dest, 114 | const void *src, size_t n); 115 | 116 | extern void *memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_320(void *dest, 117 | const void *src, size_t n); 118 | 119 | extern void *memcpy_armv5te_overfetch_align_16_block_write_16_preload_early_128(void *dest, 120 | const void *src, size_t n); 121 | 122 | extern void *memcpy_armv5te_overfetch_align_32_block_write_32_preload_early_192(void *dest, 123 | const void *src, size_t n); 124 | 125 | extern void *memcpy_simple_sunxi_preload_early_192(void *dest, const void *src, size_t n); 126 | extern void *memcpy_simple_sunxi_preload_early_192_no_catch_up(void *dest, const void *src, size_t n); 127 | extern void *memcpy_simple_sunxi_preload_early_192_no_catch_up_check_small_size_alignment(void *dest, const void *src, size_t n); 128 | extern void *memcpy_simple_sunxi_preload_early_256(void *dest, const void *src, size_t n); 129 | extern void *memcpy_simple_sunxi_preload_early_256_no_catch_up(void *dest, const void *src, size_t n); 130 | 131 | extern void *memcpy_simple_rpi_preload_early_96(void *dest, const void *src, size_t n); 132 | extern void *memcpy_simple_rpi_preload_early_96_no_catch_up(void *dest, const void *src, size_t n); 133 | extern void *memcpy_simple_rpi_preload_early_96_no_catch_up_check_small_size_alignment(void *dest, const void *src, size_t n); 134 | extern void *memcpy_simple_rpi_preload_early_128(void *dest, const void *src, size_t n); 135 | extern void *memcpy_simple_rpi_preload_early_128_no_catch_up(void *dest, const void *src, size_t n); 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /benchmark.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 Harm Hanemaaijer 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice (including the next 12 | * paragraph) shall be included in all copies or substantial portions of the 13 | * Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | * 23 | */ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "arm_asm.h" 36 | #include "new_arm.h" 37 | #ifdef INCLUDE_MEMCPY_HYBRID 38 | #include "memcpy-hybrid.h" 39 | #endif 40 | 41 | #define DEFAULT_TEST_DURATION 2.0 42 | #define RANDOM_BUFFER_SIZE 256 43 | 44 | #ifdef INCLUDE_LIBARMMEM_MEMCPY 45 | 46 | void *armmem_memcpy(void * restrict s1, const void * restrict s2, size_t n); 47 | 48 | #define LIBARMMEM_COUNT 1 49 | #else 50 | #define LIBARMMEM_COUNT 0 51 | #endif 52 | 53 | #ifdef INCLUDE_MEMCPY_HYBRID 54 | #define MEMCPY_HYBRID_COUNT 1 55 | #else 56 | #define MEMCPY_HYBRID_COUNT 0 57 | #endif 58 | 59 | #define NU_MEMCPY_VARIANTS (57 + LIBARMMEM_COUNT + MEMCPY_HYBRID_COUNT) 60 | #define NU_MEMSET_VARIANTS 5 61 | 62 | 63 | typedef void *(*memcpy_func_type)(void *dest, const void *src, size_t n); 64 | typedef void *(*memset_func_type)(void *dest, int c, size_t n); 65 | 66 | memcpy_func_type memcpy_func; 67 | memset_func_type memset_func; 68 | uint8_t *buffer_alloc, *buffer_chunk, *buffer_page, *buffer_compare; 69 | int *random_buffer_1024, *random_buffer_1M, *random_buffer_powers_of_two_up_to_4096_power_law; 70 | int *random_buffer_multiples_of_four_up_to_1024_power_law, *random_buffer_up_to_1023_power_law; 71 | double test_duration = DEFAULT_TEST_DURATION; 72 | int memcpy_mask[NU_MEMCPY_VARIANTS]; 73 | int memset_mask[NU_MEMSET_VARIANTS]; 74 | int test_alignment; 75 | 76 | static const char *memcpy_variant_name[NU_MEMCPY_VARIANTS] = { 77 | "standard memcpy", 78 | #ifdef INCLUDE_LIBARMMEM_MEMCPY 79 | "libarmmem memcpy", 80 | #endif 81 | #ifdef INCLUDE_MEMCPY_HYBRID 82 | "cortex-strings memcpy-hybrid (NEON)", 83 | #endif 84 | "armv5te memcpy", 85 | "new memcpy for cortex with line size of 32, preload offset of 192", 86 | "new memcpy for cortex with line size of 64, preload offset of 192", 87 | "new memcpy for cortex using NEON with line size 32, preload offset 192", 88 | "new memcpy for cortex using NEON with line size 64, preload offset 192", 89 | "new memcpy for cortex using NEON with line size 32, only early preload (relying on automatic prefetcher)", 90 | "new memcpy for sunxi with line size of 64, preload offset of 192 and write alignment of 32", 91 | "new memcpy for sunxi with line size of 64, preload offset of 192 and aligned access", 92 | "new memcpy for sunxi with line size of 32, preload offset of 192 and write alignment of 32", 93 | "new memcpy for rpi with preload offset of 96, write alignment of 8", 94 | "new memcpy for rpi with preload offset of 96, write alignment of 8 and aligned access", 95 | "simplified memcpy for sunxi with preload offset of 192, early preload and preload catch up", 96 | "simplified memcpy for sunxi with preload offset of 192, early preload and no preload catch up", 97 | "simplified memcpy for sunxi with preload offset of 192, early preload, no preload catch up and with small size alignment check", 98 | "simplified memcpy for sunxi with preload offset of 256, early preload and preload catch up", 99 | "simplified memcpy for sunxi with preload offset of 256, early preload and no preload catch up", 100 | "simplified memcpy for rpi with preload offset of 96, early preload and preload catch up", 101 | "simplified memcpy for rpi with preload offset of 96, early preload and no preload catch up", 102 | "simplified memcpy for rpi with preload offset of 96, early preload and no preload catch up and with small size alignment check", 103 | "simplified memcpy for rpi with preload offset of 128, early preload and preload catch up", 104 | "simplified memcpy for rpi with preload offset of 128, early preload and no preload catch up", 105 | "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 8, preload offset 96", 106 | "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 96", 107 | "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 96 with early preload", 108 | "armv5te non-overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 128 with early preload", 109 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 8, preload offset 96", 110 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 64", 111 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 96", 112 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 128", 113 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 160", 114 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 192", 115 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 256", 116 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 64", 117 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 96", 118 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 128", 119 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 160", 120 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 192", 121 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 256", 122 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 96 with early preload", 123 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 128 with early preload", 124 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 192 with early preload", 125 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, preload offset 256 with early preload", 126 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 128 with early preload", 127 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 192 with early preload", 128 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 256 with early preload", 129 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 16, no preload", 130 | "armv5te non-overfetching memcpy with write alignment of 32 and block write size of 32, no preload", 131 | "armv5te non-overfetching memcpy with line size of 64, write alignment of 32 and block write size of 32, preload offset 128 with early preload", 132 | "armv5te non-overfetching memcpy with line size of 64, write alignment of 32 and block write size of 32, preload offset 192 with early preload", 133 | "armv5te non-overfetching memcpy with line size of 64, write alignment of 32 and block write size of 32, preload offset 256 with early preload", 134 | "armv5te non-overfetching memcpy with line_size of 64, write alignment of 32 and block write size of 32, preload offset 320 with early preload", 135 | "armv5te non-overfetching memcpy with line size of 64, write alignment of 64 and block write size of 32, preload offset 192 with early preload", 136 | "armv5te non-overfetching memcpy with line size of 64, write alignment of 64 and block write size of 32, preload offset 256 with early preload", 137 | "armv5te non-overfetching memcpy with line_size of 64, write alignment of 64 and block write size of 32, preload offset 320 with early preload", 138 | "armv5te overfetching memcpy with write alignment of 16 and block write size of 16, preload offset 128 with early preload", 139 | "armv5te overfetching memcpy with write alignment of 32 and block write size of 32, preload offset 192 with early preload" 140 | }; 141 | 142 | static const memcpy_func_type memcpy_variant[NU_MEMCPY_VARIANTS] = { 143 | memcpy, 144 | #ifdef INCLUDE_LIBARMMEM_MEMCPY 145 | armmem_memcpy, 146 | #endif 147 | #ifdef INCLUDE_MEMCPY_HYBRID 148 | memcpy_hybrid, 149 | #endif 150 | memcpy_armv5te, 151 | memcpy_new_line_size_32_preload_192, 152 | memcpy_new_line_size_64_preload_192, 153 | memcpy_new_neon_line_size_32, 154 | memcpy_new_neon_line_size_64, 155 | memcpy_new_neon_line_size_32_auto, 156 | memcpy_new_line_size_64_preload_192_align_32, 157 | memcpy_new_line_size_64_preload_192_aligned_access, 158 | memcpy_new_line_size_32_preload_192_align_32, 159 | memcpy_new_line_size_32_preload_96, 160 | memcpy_new_line_size_32_preload_96_aligned_access, 161 | memcpy_simple_sunxi_preload_early_192, 162 | memcpy_simple_sunxi_preload_early_192_no_catch_up, 163 | memcpy_simple_sunxi_preload_early_192_no_catch_up_check_small_size_alignment, 164 | memcpy_simple_sunxi_preload_early_256, 165 | memcpy_simple_sunxi_preload_early_256_no_catch_up, 166 | memcpy_simple_rpi_preload_early_96, 167 | memcpy_simple_rpi_preload_early_96_no_catch_up, 168 | memcpy_simple_rpi_preload_early_96_no_catch_up_check_small_size_alignment, 169 | memcpy_simple_rpi_preload_early_128, 170 | memcpy_simple_rpi_preload_early_128_no_catch_up, 171 | memcpy_armv5te_no_overfetch_align_16_block_write_8_preload_96, 172 | memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_96, 173 | memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_96, 174 | memcpy_armv5te_no_overfetch_align_16_block_write_16_preload_early_128, 175 | memcpy_armv5te_no_overfetch_align_32_block_write_8_preload_96, 176 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_64, 177 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_96, 178 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_128, 179 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_160, 180 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_192, 181 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_256, 182 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_64, 183 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_96, 184 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_128, 185 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_160, 186 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_192, 187 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_256, 188 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_96, 189 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_128, 190 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_192, 191 | memcpy_armv5te_no_overfetch_align_32_block_write_16_preload_early_256, 192 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_128, 193 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_192, 194 | memcpy_armv5te_no_overfetch_align_32_block_write_32_preload_early_256, 195 | memcpy_armv5te_no_overfetch_align_32_block_write_16_no_preload, 196 | memcpy_armv5te_no_overfetch_align_32_block_write_32_no_preload, 197 | memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_128, 198 | memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_192, 199 | memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_256, 200 | memcpy_armv5te_no_overfetch_line_64_align_32_block_write_32_preload_early_320, 201 | memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_192, 202 | memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_256, 203 | memcpy_armv5te_no_overfetch_line_64_align_64_block_write_32_preload_early_320, 204 | memcpy_armv5te_overfetch_align_16_block_write_16_preload_early_128, 205 | memcpy_armv5te_overfetch_align_32_block_write_32_preload_early_192 206 | }; 207 | 208 | static const char *memset_variant_name[NU_MEMSET_VARIANTS] = { 209 | "libc memset", 210 | "optimized memset with write alignment of 0", 211 | "optimized memset with write alignment of 8", 212 | "optimized memset with write alignment of 32", 213 | "NEON memset", 214 | }; 215 | 216 | static const memset_func_type memset_variant[NU_MEMSET_VARIANTS] = { 217 | memset, 218 | memset_new_align_0, 219 | memset_new_align_8, 220 | memset_new_align_32, 221 | memset_neon 222 | }; 223 | 224 | static double get_time() { 225 | struct timespec ts; 226 | clock_gettime(CLOCK_REALTIME, &ts); 227 | return (double)ts.tv_sec + (double)ts.tv_nsec / 1000000000.0; 228 | } 229 | 230 | static void test_mixed_powers_of_two_word_aligned(int i) { 231 | memcpy_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 232 | buffer_page + random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 233 | random_buffer_powers_of_two_up_to_4096_power_law[i & (RANDOM_BUFFER_SIZE - 1)]); 234 | } 235 | 236 | static void test_mixed_power_law_word_aligned(int i) { 237 | memcpy_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 238 | buffer_page + random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 239 | random_buffer_multiples_of_four_up_to_1024_power_law[i & (RANDOM_BUFFER_SIZE - 1)]); 240 | } 241 | 242 | static void test_mixed_power_law_unaligned(int i) { 243 | memcpy_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 244 | buffer_page + random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 245 | random_buffer_up_to_1023_power_law[i & (RANDOM_BUFFER_SIZE - 1)]); 246 | } 247 | 248 | static void test_unaligned_random_3(int i) { 249 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 250 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 251 | 3); 252 | } 253 | 254 | static void test_unaligned_random_8(int i) { 255 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 256 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 257 | 8); 258 | } 259 | 260 | static void test_aligned_4(int i) { 261 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 262 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 263 | 4); 264 | } 265 | 266 | static void test_aligned_8(int i) { 267 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 268 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 269 | 8); 270 | } 271 | 272 | static void test_aligned_16(int i) { 273 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 274 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 275 | 16); 276 | } 277 | 278 | static void test_aligned_32(int i) { 279 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 280 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 281 | 32); 282 | } 283 | 284 | static void test_aligned_64(int i) { 285 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 286 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 287 | 64); 288 | } 289 | 290 | static void test_aligned_128(int i) { 291 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 292 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 293 | 128); 294 | } 295 | 296 | static void test_aligned_256(int i) { 297 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 298 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 299 | 256); 300 | } 301 | 302 | static void test_unaligned_random_17(int i) { 303 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 304 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 305 | 17); 306 | } 307 | 308 | static void test_unaligned_random_28(int i) { 309 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 310 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 311 | 28); 312 | } 313 | 314 | static void test_aligned_28(int i) { 315 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 316 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 317 | 28); 318 | } 319 | 320 | static void test_unaligned_random_64(int i) { 321 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 322 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 323 | 64); 324 | } 325 | 326 | static void test_unaligned_random_137(int i) { 327 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 328 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 329 | 137); 330 | } 331 | 332 | static void test_unaligned_random_1024(int i) { 333 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 334 | buffer_page + 8192 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 335 | 1024); 336 | } 337 | 338 | static void test_unaligned_random_32768(int i) { 339 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 340 | buffer_page + 65536 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 341 | 32768); 342 | } 343 | 344 | static void test_unaligned_random_1M(int i) { 345 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 346 | buffer_page + 2 * 1024 * 1024 + 347 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)], 348 | 1024 * 1024); 349 | } 350 | 351 | static void test_source_dest_aligned_random_64(int i) { 352 | memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 353 | buffer_page + 4096 + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 354 | 64); 355 | } 356 | 357 | static void test_source_dest_aligned_random_1024(int i) { 358 | memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 359 | buffer_page + 4096 + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 360 | 1024); 361 | } 362 | 363 | static void test_source_dest_aligned_random_32768(int i) { 364 | memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 365 | buffer_page + 65536 + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 366 | 32768); 367 | } 368 | 369 | static void test_source_dest_aligned_random_1M(int i) { 370 | memcpy_func(buffer_page + random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 371 | buffer_page + 2 * 1024 * 1024 + 372 | random_buffer_1024[i & (RANDOM_BUFFER_SIZE - 1)], 373 | 1024 * 1024); 374 | } 375 | 376 | static void test_word_aligned_28(int i) { 377 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 378 | buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 379 | 28); 380 | } 381 | 382 | static void test_word_aligned_64(int i) { 383 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 384 | buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 385 | 64); 386 | } 387 | 388 | static void test_word_aligned_296(int i) { 389 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 390 | buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 391 | 296); 392 | } 393 | 394 | static void test_word_aligned_1024(int i) { 395 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 396 | buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 397 | 1024); 398 | } 399 | 400 | static void test_word_aligned_4096(int i) { 401 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 402 | buffer_page + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 403 | 4096); 404 | } 405 | 406 | static void test_word_aligned_32768(int i) { 407 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 408 | buffer_page + 128 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4, 409 | 32768); 410 | } 411 | 412 | static void test_chunk_aligned_64(int i) { 413 | memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32, 414 | buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32, 415 | 64); 416 | } 417 | 418 | static void test_chunk_aligned_296(int i) { 419 | memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32, 420 | buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32, 421 | 296); 422 | } 423 | 424 | static void test_chunk_aligned_1024(int i) { 425 | memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32, 426 | buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32, 427 | 1024); 428 | } 429 | 430 | static void test_chunk_aligned_4096(int i) { 431 | memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32, 432 | buffer_chunk + 64 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32, 433 | 4096); 434 | } 435 | 436 | static void test_chunk_aligned_32768(int i) { 437 | memcpy_func(buffer_chunk + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32, 438 | buffer_chunk + 128 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 32, 439 | 32768); 440 | } 441 | 442 | static void test_page_aligned_1024(int i) { 443 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 444 | buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 445 | 1024); 446 | } 447 | 448 | static void test_page_aligned_4096(int i) { 449 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 450 | buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 451 | 4096); 452 | } 453 | 454 | static void test_page_aligned_32768(int i) { 455 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 456 | buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 457 | 32768); 458 | } 459 | 460 | static void test_page_aligned_256K(int i) { 461 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 462 | buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 463 | 256 * 1024); 464 | } 465 | 466 | static void test_page_aligned_1M(int i) { 467 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 468 | buffer_page + 8192 * 1024 + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 469 | 1024 * 1024); 470 | } 471 | 472 | static void test_page_aligned_8M(int i) { 473 | memcpy_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 474 | buffer_page + 16384 * 1024 + random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 475 | 8 * 1024 * 1024); 476 | } 477 | 478 | static void test_random_mixed_sizes_1024(int i) { 479 | memcpy_func(buffer_page + random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)], 480 | buffer_page + 4096 + random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)], 481 | 1 + random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))]); 482 | } 483 | 484 | static void test_random_mixed_sizes_64(int i) { 485 | memcpy_func(buffer_page + random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)], 486 | buffer_page + 4096 + random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)], 487 | 1 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 63)); 488 | } 489 | 490 | static void test_random_mixed_sizes_DRAM_1024(int i) { 491 | /* Source and destination address selected randomly from range of 8MB. */ 492 | memcpy_func(buffer_page + 493 | // Select a random 8192 bytes aligned addres. 494 | 8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] + 495 | // Add a random offset up to (4096 - 256) in steps of 256 based on higher bits 496 | // of the iteration number. 497 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 498 | // Add a random offset up to 1023 in steps of 1 based on the lower end bits 499 | // of the iteration number. 500 | random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)], 501 | buffer_page + 502 | 8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] + 503 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 504 | random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)], 505 | 1 + random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))]); 506 | } 507 | 508 | static void test_random_mixed_sizes_DRAM_64(int i) { 509 | /* Source and destination address selected randomly from range of 8MB. */ 510 | memcpy_func(buffer_page + 511 | 8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] + 512 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 513 | random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)], 514 | buffer_page + 515 | 8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] + 516 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 517 | random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)], 518 | 1 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 63)); 519 | } 520 | 521 | static void test_random_mixed_sizes_DRAM_word_aligned_1024(int i) { 522 | /* Source and destination address selected randomly from range of 8MB. */ 523 | memcpy_func(buffer_page + 524 | // Select a random 8192 bytes aligned addres. 525 | 8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] + 526 | // Add a random offset up to (4096 - 256) in steps of 256 based on higher bits 527 | // of the iteration number. 528 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 529 | // Add a random offset up to 1020 in steps of 4 based on the lower end bits 530 | // of the iteration number. 531 | (random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)] & (~3)), 532 | buffer_page + 533 | 8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] + 534 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 535 | (random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)] & (~3)), 536 | 4 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & (~3))); 537 | } 538 | 539 | static void test_random_mixed_sizes_DRAM_word_aligned_256(int i) { 540 | /* Source and destination address selected randomly from range of 8MB. */ 541 | memcpy_func(buffer_page + 542 | 8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] + 543 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 544 | (random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)] & (~3)), 545 | buffer_page + 546 | 8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] + 547 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 548 | (random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)] & (~3)), 549 | 4 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 252)); 550 | } 551 | 552 | static void test_random_mixed_sizes_DRAM_word_aligned_64(int i) { 553 | /* Source and destination address selected randomly from range of 8MB. */ 554 | memcpy_func(buffer_page + 555 | 8192 * random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] + 556 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 557 | (random_buffer_1024[(i * 4) & (RANDOM_BUFFER_SIZE - 1)] & (~3)), 558 | buffer_page + 559 | 8192 * random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] + 560 | ((i / (RANDOM_BUFFER_SIZE / 4)) & 15) * 256 + 561 | (random_buffer_1024[(i * 4 + 1) & (RANDOM_BUFFER_SIZE - 1)] & (~3)), 562 | 4 + (random_buffer_1024[((i * 4 + 2) & (RANDOM_BUFFER_SIZE - 1))] & 60)); 563 | } 564 | 565 | static void test_memset_page_aligned_1024(int i) { 566 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 567 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 1024); 568 | } 569 | 570 | static void test_memset_page_aligned_4096(int i) { 571 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4096, 572 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 4096); 573 | } 574 | 575 | static void test_memset_mixed_powers_of_two_word_aligned(int i) { 576 | memset_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 577 | random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 578 | random_buffer_powers_of_two_up_to_4096_power_law[i & (RANDOM_BUFFER_SIZE - 1)]); 579 | } 580 | 581 | static void test_memset_mixed_power_law_word_aligned(int i) { 582 | memset_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 583 | random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 584 | random_buffer_multiples_of_four_up_to_1024_power_law[i & (RANDOM_BUFFER_SIZE - 1)]); 585 | } 586 | 587 | static void test_memset_mixed_power_law_unaligned(int i) { 588 | memset_func(buffer_page + random_buffer_1M[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 589 | random_buffer_1M[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 590 | random_buffer_up_to_1023_power_law[i & (RANDOM_BUFFER_SIZE - 1)]); 591 | } 592 | 593 | static void test_memset_aligned_4(int i) { 594 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 595 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 596 | 4); 597 | } 598 | 599 | static void test_memset_aligned_8(int i) { 600 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 601 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 602 | 8); 603 | } 604 | 605 | static void test_memset_aligned_16(int i) { 606 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 607 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 608 | 16); 609 | } 610 | 611 | static void test_memset_aligned_28(int i) { 612 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 613 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 614 | 28); 615 | } 616 | 617 | static void test_memset_aligned_32(int i) { 618 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 619 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 620 | 32); 621 | } 622 | 623 | static void test_memset_aligned_64(int i) { 624 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 625 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 626 | 64); 627 | } 628 | 629 | static void test_memset_various_aligned_64(int i) { 630 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 32 + test_alignment, 631 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 632 | 64); 633 | } 634 | 635 | static void test_memset_aligned_80(int i) { 636 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 637 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 638 | 80); 639 | } 640 | 641 | static void test_memset_aligned_92(int i) { 642 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 643 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 644 | 92); 645 | } 646 | 647 | static void test_memset_aligned_128(int i) { 648 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 649 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 650 | 128); 651 | } 652 | 653 | static void test_memset_aligned_256(int i) { 654 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)] * 4, 655 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 656 | 256); 657 | } 658 | 659 | static void test_memset_unaligned_random_3(int i) { 660 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 661 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 662 | 3); 663 | } 664 | 665 | static void test_memset_unaligned_random_8(int i) { 666 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 667 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 668 | 8); 669 | } 670 | 671 | static void test_memset_unaligned_random_17(int i) { 672 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 673 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 674 | 17); 675 | } 676 | 677 | static void test_memset_unaligned_random_28(int i) { 678 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 679 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 680 | 28); 681 | } 682 | 683 | static void test_memset_unaligned_random_64(int i) { 684 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 685 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 686 | 64); 687 | } 688 | 689 | static void test_memset_unaligned_random_137(int i) { 690 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 691 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 692 | 137); 693 | } 694 | 695 | static void test_memset_unaligned_random_1023(int i) { 696 | memset_func(buffer_page + random_buffer_1024[(i * 2) & (RANDOM_BUFFER_SIZE - 1)], 697 | random_buffer_1024[(i * 2 + 1) & (RANDOM_BUFFER_SIZE - 1)] & 0xFF, 698 | 1023); 699 | } 700 | 701 | static void clear_data_cache() { 702 | int val = 0; 703 | for (int i = 0; i < 1024 * 1024 * 32; i += 4) { 704 | val += buffer_alloc[i]; 705 | } 706 | for (int i = 0; i < 1024 * 1024 * 32; i += 4) { 707 | buffer_alloc[i] = val; 708 | } 709 | } 710 | 711 | static void do_test(const char *name, void (*test_func)(int), int bytes) { 712 | int nu_iterations; 713 | if (bytes >= 1024) 714 | nu_iterations = (64 * 1024 * 1024) / bytes; 715 | else if (bytes >= 64) 716 | nu_iterations = (16 * 1024 * 1024) / bytes; 717 | else 718 | nu_iterations = 1024 * 1024 / 2; 719 | /* Warm-up. */ 720 | clear_data_cache(); 721 | double temp_time = get_time(); 722 | for (int i = 0; i < nu_iterations; i++) 723 | test_func(i); 724 | usleep(100000); 725 | double start_time = get_time(); 726 | double end_time; 727 | int count = 0; 728 | for (;;) { 729 | for (int i = 0; i < nu_iterations; i++) 730 | test_func(i); 731 | count++; 732 | end_time = get_time(); 733 | if (end_time - start_time >= test_duration) 734 | break; 735 | } 736 | double bandwidth = (double)bytes * nu_iterations * count / (1024 * 1024) 737 | / (end_time - start_time); 738 | printf("%s: %.2lf MB/s\n", name, bandwidth); 739 | } 740 | 741 | static void do_test_all(const char *name, void (*test_func)(), int bytes) { 742 | for (int j = 0; j < NU_MEMCPY_VARIANTS; j++) 743 | if (memcpy_mask[j]) { 744 | printf("%s:\n", memcpy_variant_name[j]); 745 | memcpy_func = memcpy_variant[j]; 746 | do_test(name, test_func, bytes); 747 | } 748 | } 749 | 750 | static void fill_buffer(uint8_t *buffer) { 751 | uint32_t v = 0xEEAAEEAA; 752 | for (int i = 0; i < 1024 * 1024 * 16; i++) { 753 | buffer[i] = (v >> 24); 754 | v += i ^ 0x12345678; 755 | } 756 | } 757 | 758 | static int compare_buffers(uint8_t *buffer0, uint8_t *buffer1) { 759 | int identical = 1; 760 | int count = 0; 761 | for (int i = 0; i < 1024 * 1024 * 16; i++) { 762 | if (buffer0[i] != buffer1[i]) { 763 | count++; 764 | if (count < 10) { 765 | printf("Byte at offset %d (0x%08X) doesn't match.\n", 766 | i, i); 767 | identical = 0; 768 | } 769 | } 770 | } 771 | if (count >= 10) { 772 | printf("(%d more non-matching bytes present.)\n", count - 9); 773 | } 774 | return identical; 775 | } 776 | 777 | static void memcpy_emulate(uint8_t *dest, uint8_t *src, int size) { 778 | for (int i = 0; i < size; i++) 779 | dest[i] = src[i]; 780 | } 781 | 782 | static void do_validation(int repeat) { 783 | int passed = 1; 784 | for (int i = 0; i < 10 * repeat; i++) { 785 | int size, source, dest; 786 | size = floor(pow(2.0, (double)rand() * 20.0 / RAND_MAX)); 787 | source = rand() % (1024 * 1024 * 16 + 1 - size); 788 | int aligned = 0; 789 | if ((rand() & 3) == 0) { 790 | aligned = 1; 791 | source &= ~3; 792 | size = (size + 3) & (~3); 793 | } 794 | do { 795 | dest = rand() % (1024 * 1024 * 16 + 1 - size); 796 | if (aligned) 797 | dest &= ~3; 798 | } 799 | while (dest + size > source && dest < source + size); 800 | printf("Testing (source offset = 0x%08X, destination offset = 0x%08X, size = %d).\n", 801 | source, dest, size); 802 | fflush(stdout); 803 | fill_buffer(buffer_compare); 804 | memcpy_emulate(buffer_compare + dest, buffer_compare + source, size); 805 | fill_buffer(buffer_page); 806 | if (memcpy_func(buffer_page + dest, buffer_page + source, size) != buffer_page + dest) { 807 | printf("Validation failed: function did not return original destination address.\n"); 808 | passed = 0; 809 | } 810 | if (!compare_buffers(buffer_page, buffer_compare)) { 811 | printf("Validation failed (source offset = 0x%08X, destination offset = 0x%08X, size = %d).\n", 812 | source, dest, size); 813 | passed = 0; 814 | } 815 | } 816 | if (passed) { 817 | printf("Passed.\n"); 818 | } 819 | } 820 | 821 | static void memset_emulate(uint8_t *dest, int c, int size) { 822 | for (int i = 0; i < size; i++) 823 | dest[i] = c; 824 | } 825 | 826 | static void do_validation_memset(int repeat) { 827 | int passed = 1; 828 | for (int i = 0; i < 10 * repeat; i++) { 829 | int size, dest, c; 830 | size = floor(pow(2.0, (double)rand() * 20.0 / RAND_MAX)); 831 | dest = rand() % (1024 * 1024 * 16 + 1 - size); 832 | c = rand() & 0xFF; 833 | printf("Testing (destination offset = 0x%08X, byte = %d, size = %d).\n", 834 | dest, c, size); 835 | fflush(stdout); 836 | fill_buffer(buffer_compare); 837 | memset_emulate(buffer_compare + dest, c, size); 838 | fill_buffer(buffer_page); 839 | if (memset_func(buffer_page + dest, c, size) != buffer_page + dest) { 840 | printf("Validation failed: function did not return original destination address.\n"); 841 | passed = 0; 842 | } 843 | if (!compare_buffers(buffer_page, buffer_compare)) { 844 | printf("Validation failed (destination offset = 0x%08X, size = %d).\n", 845 | dest, size); 846 | passed = 0; 847 | } 848 | } 849 | if (passed) { 850 | printf("Passed.\n"); 851 | } 852 | } 853 | 854 | #define NU_TESTS 48 855 | 856 | typedef struct { 857 | const char *name; 858 | void (*test_func)(); 859 | int bytes; 860 | } test_t; 861 | 862 | static test_t test[NU_TESTS] = { 863 | { "Mixed powers of 2 from 4 to 4096 (power law), word aligned", test_mixed_powers_of_two_word_aligned, 32768 }, 864 | { "Mixed multiples of 4 from 4 to 1024 (power law), word aligned", test_mixed_power_law_word_aligned, 32768 }, 865 | { "Mixed from 1 to 1023 (power law), unaligned", test_mixed_power_law_unaligned, 32768 }, 866 | { "4 bytes word aligned", test_aligned_4, 4 }, 867 | { "8 bytes word aligned", test_aligned_8, 8 }, 868 | { "16 bytes word aligned", test_aligned_16, 16 }, 869 | { "28 bytes word aligned", test_aligned_28, 28 }, 870 | { "32 bytes word aligned", test_aligned_32, 32 }, 871 | { "64 bytes word aligned", test_aligned_64, 64 }, 872 | { "128 bytes word aligned", test_aligned_128, 128 }, 873 | { "256 bytes word aligned", test_aligned_256, 256 }, 874 | { "3 bytes randomly aligned", test_unaligned_random_3, 3 }, 875 | { "8 bytes randomly aligned", test_unaligned_random_8, 8 }, 876 | { "17 bytes randomly aligned", test_unaligned_random_17, 17 }, 877 | { "28 bytes randomly aligned", test_unaligned_random_28, 28 }, 878 | { "64 bytes randomly aligned", test_unaligned_random_64, 64 }, 879 | { "137 bytes randomly aligned", test_unaligned_random_137, 137 }, 880 | { "1024 bytes randomly aligned", test_unaligned_random_1024, 1024 }, 881 | { "32768 bytes randomly aligned", test_unaligned_random_32768, 32768 }, 882 | { "1M bytes randomly aligned", test_unaligned_random_1M, 1024 * 1024 }, 883 | { "64 bytes randomly aligned, source aligned with dest", 884 | test_source_dest_aligned_random_64, 64 }, 885 | { "1024 bytes randomly aligned, source aligned with dest", 886 | test_source_dest_aligned_random_1024, 1024 }, 887 | { "32768 bytes randomly aligned, source aligned with dest", 888 | test_source_dest_aligned_random_32768, 32768 }, 889 | { "1M bytes randomly aligned, source aligned with dest", 890 | test_source_dest_aligned_random_1M, 1024 *1024 }, 891 | { "Up to 1024 bytes randomly aligned", test_random_mixed_sizes_1024, 512 }, 892 | { "Up to 64 bytes randomly aligned", test_random_mixed_sizes_64, 32 }, 893 | { "Up to 1024 bytes randomly aligned (DRAM)", test_random_mixed_sizes_DRAM_1024, 894 | 512 }, 895 | { "Up to 64 bytes randomly aligned (DRAM)", test_random_mixed_sizes_DRAM_64, 896 | 32 }, 897 | { "Up to 1024 bytes word aligned (DRAM)", test_random_mixed_sizes_DRAM_word_aligned_1024, 898 | 514 }, 899 | { "Up to 256 bytes word aligned (DRAM)", test_random_mixed_sizes_DRAM_word_aligned_256, 900 | 130 }, 901 | { "Up to 64 bytes word aligned (DRAM)", test_random_mixed_sizes_DRAM_word_aligned_64, 902 | 34 }, 903 | { "28 bytes 4-byte aligned", test_word_aligned_28, 28 }, 904 | { "64 bytes 4-byte aligned", test_word_aligned_64, 64 }, 905 | { "296 bytes 4-byte aligned", test_word_aligned_296, 296 }, 906 | { "1024 bytes 4-byte aligned", test_word_aligned_1024, 1024 }, 907 | { "4096 bytes 4-byte aligned", test_word_aligned_4096, 4096 }, 908 | { "32768 bytes 4-byte aligned", test_word_aligned_32768, 32768 }, 909 | { "64 bytes 32-byte aligned", test_chunk_aligned_64, 64 }, 910 | { "296 bytes 32-byte aligned", test_chunk_aligned_296, 296 }, 911 | { "1024 bytes 32-byte aligned", test_chunk_aligned_1024, 1024 }, 912 | { "4096 bytes 32-byte aligned", test_chunk_aligned_4096, 4096 }, 913 | { "32768 bytes 32-byte aligned", test_chunk_aligned_32768, 32768 }, 914 | { "1024 bytes page aligned", test_page_aligned_1024, 1024 }, 915 | { "4096 bytes page aligned", test_page_aligned_4096, 4096 }, 916 | { "32768 bytes page aligned", test_page_aligned_32768, 32768 }, 917 | { "256K bytes page aligned", test_page_aligned_256K, 256 * 1024 }, 918 | { "1M bytes page aligned", test_page_aligned_1M, 1024 * 1024 }, 919 | { "8M bytes page aligned", test_page_aligned_8M, 8 * 1024 * 1024 }, 920 | }; 921 | 922 | #define NU_MEMSET_TESTS 23 923 | 924 | static test_t memset_test[NU_MEMSET_TESTS] = { 925 | { "Mixed powers of 2 from 4 to 4096 (power law), word aligned", test_memset_mixed_powers_of_two_word_aligned, 2048 }, 926 | { "Mixed multiples of 4 from 4 to 1024 (power law), word aligned", test_memset_mixed_power_law_word_aligned, 512 }, 927 | { "Mixed from 1 to 1023 (power law), unaligned", test_memset_mixed_power_law_unaligned, 512 }, 928 | { "1024 bytes page aligned", test_memset_page_aligned_1024, 1024 }, 929 | { "4096 bytes page aligned", test_memset_page_aligned_4096, 4096 }, 930 | { "4 bytes word aligned", test_memset_aligned_4, 4 }, 931 | { "8 bytes word aligned", test_memset_aligned_8, 8 }, 932 | { "16 bytes word aligned", test_memset_aligned_16, 16 }, 933 | { "28 bytes word aligned", test_memset_aligned_28, 28 }, 934 | { "32 bytes word aligned", test_memset_aligned_32, 32 }, 935 | { "64 bytes word aligned", test_memset_aligned_64, 64 }, 936 | { "64 bytes various alignments word aligned (multi-test)", test_memset_various_aligned_64, 64 }, 937 | { "80 bytes word aligned", test_memset_aligned_80, 80 }, 938 | { "92 bytes word aligned", test_memset_aligned_92, 92 }, 939 | { "128 bytes word aligned", test_memset_aligned_128, 128 }, 940 | { "256 bytes word aligned", test_memset_aligned_256, 256 }, 941 | { "3 bytes randomly aligned", test_memset_unaligned_random_3, 3 }, 942 | { "8 bytes randomly aligned", test_memset_unaligned_random_8, 8 }, 943 | { "17 bytes randomly aligned", test_memset_unaligned_random_17, 17 }, 944 | { "28 bytes randomly aligned", test_memset_unaligned_random_28, 28 }, 945 | { "64 bytes randomly aligned", test_memset_unaligned_random_64, 64 }, 946 | { "137 bytes randomly aligned", test_memset_unaligned_random_137, 137 }, 947 | { "1023 bytes randomly aligned", test_memset_unaligned_random_1023, 1023 }, 948 | }; 949 | 950 | static void usage() { 951 | printf("Commands:\n" 952 | "--list List test numbers and memcpy variants.\n" 953 | "--test Perform test only, 5 times for each memcpy variant.\n" 954 | "--all Perform each test 5 times for each memcpy variant.\n" 955 | "--help Show this message.\n" 956 | "Options:\n" 957 | "--duration Sets the duration of each individual test. Default is 2 seconds.\n" 958 | "--repeat Repeat each test n times. Default is 5.\n" 959 | "--quick Shorthand for --duration 1 -repeat 2.\n" 960 | "--memcpy Instead of testing all memcpy variants, test only the memcpy variants\n" 961 | " in . is a string of characters from a to h or higher, corresponding\n" 962 | " to each memcpy variant (for example, abcdef selects the first six variants).\n" 963 | "--validate Validate for correctness instead of measuring performance. The --repeat option\n" 964 | " can be used to influence the number of validation tests performed (default 5).\n" 965 | ); 966 | } 967 | 968 | static int char_to_memcpy_variant(char c) { 969 | if (c >= 'a' && c <= 'z') 970 | return c - 'a'; 971 | if (c >= 'A' && c <= 'Z') 972 | return c - 'A' + 26; 973 | return - 1; 974 | } 975 | 976 | static char memcpy_variant_to_char(int i) { 977 | if (i < 26) 978 | return 'a' + i; 979 | return 'A' + i - 26; 980 | } 981 | 982 | int main(int argc, char *argv[]) { 983 | if (argc == 1) { 984 | usage(); 985 | return 0; 986 | } 987 | int argi = 1; 988 | int command_test = - 1; 989 | int command_all = 0; 990 | int repeat = 5; 991 | int validate = 0; 992 | int memcpy_specified = 0; 993 | int memset_specified = 0; 994 | for (int i = 0; i < NU_MEMCPY_VARIANTS; i++) 995 | memcpy_mask[i] = 0; 996 | for (int i = 0; i < NU_MEMSET_VARIANTS; i++) 997 | memset_mask[i] = 0; 998 | for (;;) { 999 | if (argi >= argc) 1000 | break; 1001 | if (argi + 1 < argc && strcasecmp(argv[argi], "--test") == 0) { 1002 | int t = atoi(argv[argi + 1]); 1003 | if (t < 0 || t >= NU_TESTS) { 1004 | printf("Test out of range.\n"); 1005 | return 1; 1006 | } 1007 | command_test = t; 1008 | argi += 2; 1009 | continue; 1010 | } 1011 | if (strcasecmp(argv[argi], "--quick") == 0) { 1012 | test_duration = 1.0; 1013 | repeat = 2; 1014 | argi++; 1015 | continue; 1016 | } 1017 | if (strcasecmp(argv[argi], "--all") == 0) { 1018 | command_all = 1; 1019 | argi++; 1020 | continue; 1021 | } 1022 | if (strcasecmp(argv[argi], "--list") == 0) { 1023 | printf("Tests (memcpy):\n"); 1024 | for (int i = 0; i < NU_TESTS; i++) 1025 | printf("%3d %s\n", i, test[i].name); 1026 | printf("Tests (memset):\n"); 1027 | for (int i = 0; i < NU_MEMSET_TESTS; i++) 1028 | printf("%3d %s\n", i, memset_test[i].name); 1029 | printf("memcpy variants:\n"); 1030 | for (int i = 0; i < NU_MEMCPY_VARIANTS; i++) 1031 | printf(" %c %s\n", memcpy_variant_to_char(i), memcpy_variant_name[i]); 1032 | printf("memset variants:\n"); 1033 | for (int i = 0; i < NU_MEMSET_VARIANTS; i++) 1034 | printf(" %c %s\n", memcpy_variant_to_char(i), memset_variant_name[i]); 1035 | return 0; 1036 | } 1037 | if (strcasecmp(argv[argi], "--help") == 0) { 1038 | usage(); 1039 | return 0; 1040 | } 1041 | if (argi + 1 < argc && strcasecmp(argv[argi], "--duration") == 0) { 1042 | double d = strtod(argv[argi + 1], NULL); 1043 | if (d < 0.1 || d >= 100.0) { 1044 | printf("Duration out of range.\n"); 1045 | return 1; 1046 | } 1047 | test_duration = d; 1048 | argi += 2; 1049 | continue; 1050 | } 1051 | if (argi + 1 < argc && strcasecmp(argv[argi], "--memcpy") == 0) { 1052 | for (int i = 0; i < NU_MEMCPY_VARIANTS; i++) 1053 | memcpy_mask[i] = 0; 1054 | for (int i = 0; i < strlen(argv[argi + 1]); i++) 1055 | if (char_to_memcpy_variant(argv[argi + 1][i]) >= 0 && char_to_memcpy_variant(argv[argi + 1][i]) < NU_MEMCPY_VARIANTS) 1056 | memcpy_mask[char_to_memcpy_variant(argv[argi + 1][i])] = 1; 1057 | memcpy_specified = 1; 1058 | argi += 2; 1059 | continue; 1060 | } 1061 | if (argi + 1 < argc && strcasecmp(argv[argi], "--repeat") == 0) { 1062 | repeat = atoi(argv[argi + 1]); 1063 | if (repeat < 1 || repeat >= 1000) { 1064 | printf("Number of repeats out of range.\n"); 1065 | return 1; 1066 | } 1067 | argi += 2; 1068 | continue; 1069 | } 1070 | if (strcasecmp(argv[argi], "--validate") == 0) { 1071 | validate = 1; 1072 | argi++; 1073 | continue; 1074 | } 1075 | if (argi + 1 < argc && strcasecmp(argv[argi], "--memset") == 0) { 1076 | for (int i = 0; i < NU_MEMSET_VARIANTS; i++) 1077 | memset_mask[i] = 0; 1078 | for (int i = 0; i < strlen(argv[argi + 1]); i++) 1079 | if (char_to_memcpy_variant(argv[argi + 1][i]) >= 0 && char_to_memcpy_variant(argv[argi + 1][i]) < NU_MEMSET_VARIANTS) 1080 | memset_mask[char_to_memcpy_variant(argv[argi + 1][i])] = 1; 1081 | memset_specified = 1; 1082 | argi += 2; 1083 | continue; 1084 | } 1085 | printf("Unkown option. Try --help.\n"); 1086 | return 1; 1087 | } 1088 | 1089 | if (memcpy_specified && memset_specified) { 1090 | printf("Specify only one of --memcpy and --memset.\n"); 1091 | return 1; 1092 | } 1093 | 1094 | if (command_test != -1 && memset_specified && 1095 | command_test >= NU_MEMSET_TESTS) { 1096 | printf("Test out of range for memset.\n"); 1097 | return 1; 1098 | } 1099 | 1100 | if ((command_test != -1) + command_all != 1 && !validate) { 1101 | printf("Specify only one of --test and --all.\n"); 1102 | return 1; 1103 | } 1104 | 1105 | buffer_alloc = malloc(1024 * 1024 * 32); 1106 | buffer_page = (uint8_t *)buffer_alloc + ((4096 - ((uintptr_t)buffer_alloc & 4095)) 1107 | & 4095); 1108 | buffer_chunk = buffer_page + 17 * 32; 1109 | if (validate) 1110 | buffer_compare = malloc(1024 * 1024 * 16); 1111 | srand(0); 1112 | random_buffer_1024 = malloc(sizeof(int) * RANDOM_BUFFER_SIZE); 1113 | for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) 1114 | random_buffer_1024[i] = rand() % 1024; 1115 | random_buffer_1M = malloc(sizeof(int) * RANDOM_BUFFER_SIZE); 1116 | for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) 1117 | random_buffer_1M[i] = rand() % (1024 * 1024); 1118 | random_buffer_powers_of_two_up_to_4096_power_law = malloc(sizeof(int) * RANDOM_BUFFER_SIZE); 1119 | int random_buffer_powers_of_two_up_to_4096_power_law_total_bytes = 0; 1120 | for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) { 1121 | int size = 4 << (int)floor(11.0 * pow(1.5, 10.0 * (double)rand() / RAND_MAX) / pow(1.5, 10.0)); 1122 | random_buffer_powers_of_two_up_to_4096_power_law[i] = size; 1123 | random_buffer_powers_of_two_up_to_4096_power_law_total_bytes += size; 1124 | } 1125 | test[0].bytes = random_buffer_powers_of_two_up_to_4096_power_law_total_bytes / RANDOM_BUFFER_SIZE; 1126 | memset_test[0].bytes = test[0].bytes; 1127 | random_buffer_multiples_of_four_up_to_1024_power_law = malloc(sizeof(int) * RANDOM_BUFFER_SIZE); 1128 | int random_buffer_multiples_of_four_up_to_1024_power_law_total_bytes = 0; 1129 | for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) { 1130 | double f = (double)rand() / RAND_MAX; 1131 | int size; 1132 | if (f < 0.9) 1133 | /* 90% in the range 4 to 256. */ 1134 | size = 4 + ((int)floor(252.0 * 1135 | (pow(1.0 + f / 0.9, 5.0) - 1.0) / (pow(2.0, 5.0) - 1.0) 1136 | ) & (~3)); 1137 | else 1138 | /* 10% in the range 260 to 1024 */ 1139 | size = 4 + ((int)floor((1024 - 260.0) * 1140 | (pow(1.0 + (f - 0.9) / 0.1, 8.0) - 1.0) / (pow(2.0, 8.0) - 1.0) 1141 | ) & (~3)); 1142 | random_buffer_multiples_of_four_up_to_1024_power_law[i] = size; 1143 | random_buffer_multiples_of_four_up_to_1024_power_law_total_bytes += size; 1144 | } 1145 | test[1].bytes = random_buffer_multiples_of_four_up_to_1024_power_law_total_bytes / RANDOM_BUFFER_SIZE; 1146 | memset_test[1].bytes = test[1].bytes; 1147 | random_buffer_up_to_1023_power_law = malloc(sizeof(int) * RANDOM_BUFFER_SIZE); 1148 | int random_buffer_up_to_1023_power_law_total_bytes = 0; 1149 | for (int i = 0; i < RANDOM_BUFFER_SIZE; i++) { 1150 | int size; 1151 | size = 1 + (int)floor(1024.0 * (pow(2.0, 10.0 * (double)rand() / RAND_MAX) - 1.0) / (pow(2.0, 10.0) - 1.0)); 1152 | random_buffer_up_to_1023_power_law[i] = size; 1153 | random_buffer_up_to_1023_power_law_total_bytes += size; 1154 | } 1155 | test[2].bytes = random_buffer_up_to_1023_power_law_total_bytes / RANDOM_BUFFER_SIZE; 1156 | memset_test[2].bytes = test[2].bytes; 1157 | 1158 | if (sizeof(size_t) != sizeof(int)) { 1159 | printf("sizeof(size_t) != sizeof(int), unable to directly replace memcpy.\n"); 1160 | return 1; 1161 | } 1162 | 1163 | int start_test, end_test; 1164 | start_test = 0; 1165 | if (memset_specified) 1166 | end_test = NU_MEMSET_TESTS - 1; 1167 | else 1168 | end_test = NU_TESTS - 1; 1169 | if (command_test != - 1) { 1170 | start_test = command_test; 1171 | end_test = command_test; 1172 | } 1173 | if (validate) { 1174 | for (int j = 0; j < NU_MEMCPY_VARIANTS; j++) 1175 | if (memcpy_mask[j]) { 1176 | printf("%s:\n", memcpy_variant_name[j]); 1177 | memcpy_func = memcpy_variant[j]; 1178 | do_validation(repeat); 1179 | } 1180 | for (int j = 0; j < NU_MEMSET_VARIANTS; j++) 1181 | if (memset_mask[j]) { 1182 | printf("%s:\n", memset_variant_name[j]); 1183 | memset_func = memset_variant[j]; 1184 | do_validation_memset(repeat); 1185 | } 1186 | return 0; 1187 | } 1188 | if (!memcpy_specified) 1189 | goto skip_memcpy_test; 1190 | for (int t = start_test; t <= end_test; t++) { 1191 | for (int j = 0; j < NU_MEMCPY_VARIANTS; j++) 1192 | if (memcpy_mask[j]) { 1193 | printf("%s:\n", memcpy_variant_name[j]); 1194 | memcpy_func = memcpy_variant[j]; 1195 | for (int i = 0; i < repeat; i++) 1196 | do_test(test[t].name, test[t].test_func, test[t].bytes); 1197 | } 1198 | } 1199 | skip_memcpy_test: 1200 | if (!memset_specified) 1201 | goto skip_memset_test; 1202 | for (int t = start_test; t <= end_test; t++) { 1203 | if (t == 11) { 1204 | for (test_alignment = 0; test_alignment < 32; test_alignment += 4) { 1205 | char test_name[128]; 1206 | sprintf(test_name, "%s (alignment %d)", memset_test[t].name, 1207 | test_alignment); 1208 | for (int j = 0; j < NU_MEMSET_VARIANTS; j++) 1209 | if (memset_mask[j]) { 1210 | printf("%s:\n", memset_variant_name[j]); 1211 | memset_func = memset_variant[j]; 1212 | for (int i = 0; i < repeat; i++) 1213 | do_test(test_name, memset_test[t].test_func, memset_test[t].bytes); 1214 | } 1215 | } 1216 | continue; 1217 | } 1218 | for (int j = 0; j < NU_MEMSET_VARIANTS; j++) 1219 | if (memset_mask[j]) { 1220 | printf("%s:\n", memset_variant_name[j]); 1221 | memset_func = memset_variant[j]; 1222 | for (int i = 0; i < repeat; i++) 1223 | do_test(memset_test[t].name, memset_test[t].test_func, memset_test[t].bytes); 1224 | } 1225 | } 1226 | skip_memset_test: 1227 | exit(0); 1228 | } 1229 | -------------------------------------------------------------------------------- /memcpy-hybrid.S: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2010-2011, Linaro Limited 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | * Neither the name of Linaro Limited nor the names of its 16 | contributors may be used to endorse or promote products derived 17 | from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | Written by Dave Gilbert 32 | 33 | This memcpy routine is optimised on a Cortex-A9 and should work on 34 | all ARMv7 processors with NEON. */ 35 | 36 | /* Modified: 37 | * Change preload offset to 192. 38 | */ 39 | #define PRELOAD_OFFSET 192 40 | 41 | @ 2011-09-01 david.gilbert@linaro.org 42 | @ Extracted from local git 2f11b436 43 | 44 | .syntax unified 45 | .arch armv7-a 46 | .fpu neon 47 | 48 | @ this lets us check a flag in a 00/ff byte easily in either endianness 49 | #ifdef __ARMEB__ 50 | #define CHARTSTMASK(c) 1<<(31-(c*8)) 51 | #else 52 | #define CHARTSTMASK(c) 1<<(c*8) 53 | #endif 54 | .text 55 | .thumb 56 | 57 | @ --------------------------------------------------------------------------- 58 | .thumb_func 59 | .align 2 60 | .p2align 4,,15 61 | .global memcpy_hybrid 62 | .type memcpy_hybrid,%function 63 | memcpy_hybrid: 64 | @ r0 = dest 65 | @ r1 = source 66 | @ r2 = count 67 | @ returns dest in r0 68 | @ Overlaps of source/dest not allowed according to spec 69 | @ Note this routine relies on v7 misaligned loads/stores 70 | pld [r1] 71 | mov r12, r0 @ stash original r0 72 | cmp r2,#32 73 | blt 10f @ take the small copy case separately 74 | 75 | @ test for either source or destination being misaligned 76 | @ (We only rely on word align) 77 | tst r0,#3 78 | it eq 79 | tsteq r1,#3 80 | bne 30f @ misaligned case 81 | 82 | 4: 83 | @ at this point we are word (or better) aligned and have at least 84 | @ 32 bytes to play with 85 | 86 | @ If it's a huge copy, try Neon 87 | cmp r2, #128*1024 88 | bge 35f @ Sharing general non-aligned case here, aligned could be faster 89 | 90 | push {r3,r4,r5,r6,r7,r8,r10,r11} 91 | 5: 92 | ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11} 93 | sub r2,r2,#32 94 | pld [r1,#PRELOAD_OFFSET] 95 | cmp r2,#32 96 | stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} 97 | bge 5b 98 | 99 | pop {r3,r4,r5,r6,r7,r8,r10,r11} 100 | @ We are now down to less than 32 bytes 101 | cbz r2,15f @ quick exit for the case where we copied a multiple of 32 102 | 103 | 10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes) 104 | cmp r2,#4 105 | blt 12f 106 | 11: 107 | sub r2,r2,#4 108 | cmp r2,#4 109 | ldr r3, [r1],#4 110 | str r3, [r0],#4 111 | bge 11b 112 | 12: 113 | tst r2,#2 114 | itt ne 115 | ldrhne r3, [r1],#2 116 | strhne r3, [r0],#2 117 | 118 | tst r2,#1 119 | itt ne 120 | ldrbne r3, [r1],#1 121 | strbne r3, [r0],#1 122 | 123 | 15: @ exit 124 | mov r0,r12 @ restore r0 125 | bx lr 126 | 127 | .align 2 128 | .p2align 4,,15 129 | 30: @ non-aligned - at least 32 bytes to play with 130 | @ Test for co-misalignment 131 | eor r3, r0, r1 132 | tst r3,#3 133 | beq 50f 134 | 135 | @ Use Neon for misaligned 136 | 35: 137 | vld1.8 {d0,d1,d2,d3}, [r1]! 138 | sub r2,r2,#32 139 | cmp r2,#32 140 | pld [r1,#PRELOAD_OFFSET] 141 | vst1.8 {d0,d1,d2,d3}, [r0]! 142 | bge 35b 143 | b 10b @ TODO: Probably a bad idea to switch to ARM at this point 144 | 145 | .align 2 146 | .p2align 4,,15 147 | 50: @ Co-misaligned 148 | @ At this point we've got at least 32 bytes 149 | 51: 150 | ldrb r3,[r1],#1 151 | sub r2,r2,#1 152 | strb r3,[r0],#1 153 | tst r0,#7 154 | bne 51b 155 | 156 | cmp r2,#32 157 | blt 10b 158 | b 4b 159 | -------------------------------------------------------------------------------- /memcpy-hybrid.h: -------------------------------------------------------------------------------- 1 | 2 | extern void *memcpy_hybrid(void *dest, const void *src, size_t n); 3 | -------------------------------------------------------------------------------- /new_arm.S: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Harm Hanemaaijer 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice (including the next 12 | * paragraph) shall be included in all copies or substantial portions of the 13 | * Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | * 23 | */ 24 | 25 | #ifdef CONFIG_THUMB 26 | #define W(instr) instr.w 27 | #define THUMB(instr...) instr 28 | #define ARM(instr...) 29 | #else 30 | #define W(instr) instr 31 | #define THUMB(instr...) 32 | #define ARM(instr...) instr 33 | #endif 34 | 35 | /* 36 | * In practice, because the way NEON is configured on most systems, 37 | * specifying alignment hints for NEON instructions doesn't seem 38 | * to improve performance, or even degrade performance in some cases. 39 | * However, actually having the address aligned to an element 40 | * boundary or greater is beneficial. 41 | */ 42 | #define NEON_ALIGN(n) 43 | /* #define NEON_ALIGN(n) :n */ 44 | 45 | /* Prevent the stack from becoming executable */ 46 | #if defined(__linux__) && defined(__ELF__) 47 | .section .note.GNU-stack,"",%progbits 48 | #endif 49 | 50 | .text 51 | .syntax unified 52 | .arch armv7a 53 | .fpu neon 54 | 55 | .macro asm_function function_name 56 | .global \function_name 57 | .func \function_name 58 | .type \function_name, function 59 | ARM( .p2align 5 ) 60 | THUMB( .p2align 2 ) 61 | \function_name: 62 | .endm 63 | 64 | /* 65 | * The following memcpy implementation is optimized with a fast path 66 | * for common, word aligned cases and optionally use unaligned access for 67 | * small sizes. 68 | * 69 | * - line_size is the cache line size used for prefetches. Must be 64 or 32. 70 | * - prefetch_distance is the number of cache lines to look ahead and must be 71 | * >= 2. 72 | * - write_align is the write alignment enforced before the main loop for larger 73 | * sizes (word aligned case) and must be 0, 16, 32, or 64. 74 | * - aligned_access must be 0 or 1. When enabled, no unaligned memory accesses 75 | * will occur. Both small size tresholds for unaligned access are not used 76 | * in this case. 77 | */ 78 | 79 | /* The threshold size for using the fast path for the word-aligned case. */ 80 | #define FAST_PATH_THRESHOLD 256 81 | /* The threshold size for using the small size path for the word-aligned case. */ 82 | #define SMALL_SIZE_THRESHOLD 15 83 | /* 84 | * The threshold size for using the small size path for the unaligned case. 85 | * Unaligned memory accesses will be generated for requests smaller or equal to 86 | * this size. 87 | */ 88 | #define UNALIGNED_SMALL_SIZE_THRESHOLD 64 89 | /* 90 | * The threshold size for using the small size path when both the source and 91 | * the destination are unaligned. Unaligned memory accesses will be generated 92 | * for requests smaller of equal to this size. 93 | */ 94 | #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 32 95 | 96 | /* 97 | * For a code-reduced version, define all four of the above constants to 0, 98 | * eliminating the fast path and small size special cases. With Thumb2 99 | * enabled, this resulted in a reduction in code size from 1150 to 824 bytes, 100 | * at the cost of lower performance for smaller sizes. 101 | */ 102 | // #define FAST_PATH_THRESHOLD 0 103 | // #define SMALL_SIZE_THRESHOLD 0 104 | // #define UNALIGNED_SMALL_SIZE_THRESHOLD 0 105 | // #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 0 106 | 107 | /* 108 | * EARLY_PREFETCHES is used in the fast path implementation. 109 | * The optimal value for EARLY_PREFETCHES was determined empirically. 110 | * It is equal to prefetch_distance + 1 for line_size 32. 111 | * and prefetch_distance - 1 for line_size 64. 112 | */ 113 | #define EARLY_PREFETCHES (\prefetch_distance - (\line_size / 32) * 2 + 3) 114 | 115 | #if FAST_PATH_THRESHOLD > 0 116 | #define FAST_PATH(instr...) instr 117 | #define NO_FAST_PATH(instr...) 118 | #else 119 | #define FAST_PATH(instr...) 120 | #define NO_FAST_PATH(instr...) instr 121 | #endif 122 | 123 | 124 | /* Helper macro for the fast-path implementation. */ 125 | 126 | .macro copy_16_bytes bytes_to_go, line_size, prefetch_distance 127 | #ifdef CONFIG_THUMB 128 | /* 129 | * When Thumb2 mode is enabled, the ldmia/stmia instructions 130 | * will be 16-bit, and the preload instruction will be 131 | * 32-bit, so we only need one 32-bit wide nop instruction 132 | * when there's no preload, for a total size of two words. 133 | */ 134 | .if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \ 135 | (\bytes_to_go % \line_size) == 0 136 | pld [r1, ip] 137 | ldmia r1!, {r3, r4, r5, r6} 138 | stmia r0!, {r3, r4, r5, r6} 139 | .else 140 | ldmia r1!, {r3, r4, r5, r6} 141 | W( nop ) 142 | stmia r0!, {r3, r4, r5, r6} 143 | .endif 144 | #else 145 | /* 146 | * When ARM mode is enabled, every instruction is one word, 147 | * so make sure the entire block is four instructions. 148 | */ 149 | .if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \ 150 | (\bytes_to_go % \line_size) == 0 151 | pld [r1, ip] 152 | .else 153 | nop 154 | .endif 155 | ldmia r1!, {r3, r4, r5, r6} 156 | nop 157 | stmia r0!, {r3, r4, r5, r6} 158 | #endif 159 | .endm 160 | 161 | 162 | /* Helper macro implementing unaligned copy. */ 163 | 164 | .macro unaligned_copy shift, line_size, prefetch_distance, write_align, \ 165 | aligned_access 166 | /* 167 | * ip is the aligned source base address. 168 | * r3 is a word of data from the source. 169 | */ 170 | .if \write_align > 0 171 | cmp r2, #(32 + \write_align - 4) 172 | .else 173 | cmp r2, #32 174 | .endif 175 | push {r5} 176 | blt 55f 177 | subs r2, r2, #32 178 | 179 | /* Handle write alignment. */ 180 | .if \write_align > 0 181 | .if \write_align == 8 182 | tst r0, #4 183 | mov r4, r3, lsr #\shift 184 | ldrne r3, [r1], #4 185 | subne r2, r2, #4 186 | orrne r4, r4, r3, lsl #(32 - \shift) 187 | strne r4, [r0], #4 188 | .else 189 | ands r5, r0, #(\write_align - 1) 190 | rsb r5, r5, #\write_align 191 | beq 59f 192 | sub r2, r2, r5 193 | 194 | 58: movs r4, r3, lsr #\shift 195 | ldr r3, [r1], #4 196 | subs r5, r5, #4 197 | orr r4, r4, r3, lsl #(32 - \shift) 198 | str r4, [r0], #4 199 | bgt 58b 200 | 59: 201 | .endif 202 | .endif 203 | 204 | /* 205 | * Assume a preload at aligned base + line_size will 206 | * be useful. 207 | */ 208 | pld [ip, #\line_size] 209 | push {r6-r11} 210 | mov r11, r3 211 | 212 | mov r4, ip 213 | add r5, r1, #(\prefetch_distance * \line_size) 214 | subs r2, r2, #(\prefetch_distance * \line_size) 215 | bic r3, r5, #31 216 | add r4, r4, #(2 * \line_size) 217 | blt 54f 218 | cmp r4, r3 219 | sub ip, r3, r1 220 | /* 221 | * "Catch-up" the early preloads (which have been performed up 222 | * to aligned source address + line_size) to the preload offset 223 | * used in the main loop. 224 | */ 225 | bge 52f 226 | 51: adds r4, r4, #\line_size /* Thumb16 */ 227 | cmp r4, r3 228 | pld [r4, #(- \line_size)] 229 | blt 51b 230 | 52: 231 | /* 232 | * Note that when L1_CACHE_BYTES is 64, we are 233 | * prefetching every 32 bytes. Although not optimal 234 | * there doesn't seem to be big penalty for the extra 235 | * preload instructions and it prevents greater 236 | * code size and complexity. 237 | */ 238 | 53: pld [r1, ip] 239 | 54: 240 | ldmia r1!, {r4-r7} 241 | mov r3, r11, lsr #\shift 242 | ldmia r1!, {r8-r11} 243 | orr r3, r3, r4, lsl #(32 - \shift) 244 | movs r4, r4, lsr #\shift /* Thumb16 */ 245 | orr r4, r4, r5, lsl #(32 - \shift) 246 | movs r5, r5, lsr #\shift /* Thumb16 */ 247 | orr r5, r5, r6, lsl #(32 - \shift) 248 | movs r6, r6, lsr #\shift /* Thumb16 */ 249 | orr r6, r6, r7, lsl #(32 - \shift) 250 | movs r7, r7, lsr #\shift /* Thumb16 */ 251 | orr r7, r7, r8, lsl #(32 - \shift) 252 | mov r8, r8, lsr #\shift 253 | orr r8, r8, r9, lsl #(32 - \shift) 254 | mov r9, r9, lsr #\shift 255 | orr r9, r9, r10, lsl #(32 - \shift) 256 | mov r10, r10, lsr #\shift 257 | orr r10, r10, r11, lsl #(32 - \shift) 258 | subs r2, r2, #32 259 | stmia r0!, {r3-r10} 260 | bge 53b 261 | cmn r2, #(\prefetch_distance * \line_size) 262 | bge 54b 263 | /* Correct the count. */ 264 | adds r2, r2, #(\prefetch_distance * \line_size + 32) 265 | 266 | mov r3, r11 267 | pop {r6-r11} 268 | 269 | 55: bics r5, r2, #3 270 | beq 57f 271 | 272 | 56: movs r4, r3, lsr #\shift 273 | ldr r3, [r1], #4 274 | subs r5, r5, #4 275 | orr r4, r4, r3, lsl #(32 - \shift) 276 | str r4, [r0], #4 277 | bgt 56b 278 | 279 | 57: pop {r5} 280 | pop {r4} 281 | subs r1, r1, #((32 - \shift) / 8) 282 | .if \aligned_access == 1 283 | b 7b 284 | .else 285 | b 3b 286 | .endif 287 | .endm 288 | 289 | 290 | /* The main memcpy function macro. */ 291 | 292 | .macro memcpy_variant line_size, prefetch_distance, write_align, \ 293 | aligned_access 294 | 295 | .if \aligned_access == 1 296 | cmp r2, #3 297 | .else 298 | NO_FAST_PATH( cmp r2, #3 ) 299 | .endif 300 | orr r3, r0, r1 301 | .if \aligned_access == 1 302 | push {r0} 303 | ble 7f 304 | .else 305 | NO_FAST_PATH( push {r0} ) 306 | NO_FAST_PATH( ble 3f ) 307 | .endif 308 | bic ip, r1, #(\line_size - 1) 309 | tst r3, #3 310 | pld [ip] 311 | .if \aligned_access == 1 312 | FAST_PATH( bne 30f ) 313 | .else 314 | FAST_PATH( push {r0} ) 315 | FAST_PATH( bne 7f ) /* Unaligned source or destination. */ 316 | .endif 317 | FAST_PATH( cmp r2, #FAST_PATH_THRESHOLD ) 318 | FAST_PATH( bgt 10f ) 319 | NO_FAST_PATH( bne 30f ) 320 | #if FAST_PATH_THRESHOLD == 0 321 | /* 322 | * When the fast path is disabled, check whether there are 323 | * enough bytes for alignment, and jump to the main handling 324 | * code for larger sizes. 325 | */ 326 | .if \write_align > 0 327 | cmp r2, #(\write_align - 4) 328 | bge 10f 329 | .endif 330 | push {r4} 331 | b 18f 332 | #endif 333 | 334 | /* 335 | * Fast path for aligned copies of size <= FAST_PATH_THRESHOLD. 336 | */ 337 | #if FAST_PATH_THRESHOLD > 0 338 | #if SMALL_SIZE_THRESHOLD == 15 339 | bics r3, r2, #15 340 | pld [ip, #\line_size] 341 | /* Jump for small sizes <= 15 bytes. */ 342 | beq 5f 343 | #else 344 | cmp r2, #SMALL_SIZE_THRESHOLD 345 | pld [ip, #\line_size] 346 | /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */ 347 | ble 5f 348 | bic r3, r2, #15 349 | #endif 350 | 351 | 9: /* 352 | * This is the entry-point into the fast path from 353 | * an unaligned request that has been aligned. 354 | */ 355 | push {r4, r5, r6} 356 | 357 | /* 358 | * Use a heuristic to determine whether the preload 359 | * at aligned_base + 2 * line_size will be useful. 360 | */ 361 | .if EARLY_PREFETCHES >= 3 362 | cmp r2, #(2 * \line_size - \line_size / 2) 363 | .endif 364 | add r5, ip, #(EARLY_PREFETCHES * \line_size) 365 | .if EARLY_PREFETCHES >= 3 366 | blt 1f 367 | .endif 368 | .if EARLY_PREFETCHES == 3 369 | pld [ip, #(2 * \line_size)] ) 370 | .endif 371 | .if EARLY_PREFETCHES == 4 372 | cmp r2, #(3 * \line_size - \line_size / 2) 373 | pld [ip, #(2 * \line_size)] 374 | blt 1f 375 | pld [ip, #(3 * \line_size)] 376 | .endif 377 | .if EARLY_PREFETCHES == 5 378 | cmp r2, #(3 * \line_size - \line_size / 2) 379 | pld [ip, #(2 * \line_size)] 380 | blt 1f 381 | cmp r2, #(4 * \line_size - \line_size / 2) 382 | pld [ip, #(3 * \line_size)] 383 | blt 1f 384 | pld [ip, #(4 * \line_size)] 385 | .endif 386 | 387 | 1: /* 388 | * Set r5 so that the next preload will occur 389 | * exactly at aligned_base + EARLY_PREFETCHES * 390 | * line_size. For example, if line_size is 64 391 | * and the number of bytes is 240, the next preload 392 | * will occur after processing 48 bytes, which is derived 393 | * from the formula r3 & (line_size - 1), 394 | * where r3 is equal to number_of_bytes & (~15). 395 | */ 396 | rsb r4, r3, #256 397 | subs r5, r5, r1 398 | and ip, r3, #(\line_size - 1) 399 | subs r2, r2, r3 /* Thumb16 */ 400 | THUMB( lsrs r4, r4, #1 ) /* Thumb16 */ 401 | sub ip, r5, ip 402 | add pc, pc, r4 403 | nop 404 | /* >= 256 bytes to go. */ 405 | copy_16_bytes 256, \line_size, \prefetch_distance 406 | /* >= 240 bytes go. */ 407 | copy_16_bytes 240, \line_size, \prefetch_distance 408 | /* >= 224 bytes to go. */ 409 | copy_16_bytes 224, \line_size, \prefetch_distance 410 | /* >= 204 bytes go. */ 411 | copy_16_bytes 204, \line_size, \prefetch_distance 412 | /* >= 192 bytes to go. */ 413 | copy_16_bytes 192, \line_size, \prefetch_distance 414 | /* >= 176 bytes go. */ 415 | copy_16_bytes 176, \line_size, \prefetch_distance 416 | /* >= 160 bytes to go. */ 417 | copy_16_bytes 160, \line_size, \prefetch_distance 418 | /* >= 144 bytes go. */ 419 | copy_16_bytes 144, \line_size, \prefetch_distance 420 | /* >= 128 bytes to go. */ 421 | copy_16_bytes 128, \line_size, \prefetch_distance 422 | /* >= 112 bytes go. */ 423 | copy_16_bytes 112, \line_size, \prefetch_distance 424 | /* >= 96 bytes to go. */ 425 | copy_16_bytes 96, \line_size, \prefetch_distance 426 | /* >= 80 bytes to go. */ 427 | copy_16_bytes 80, \line_size, \prefetch_distance 428 | /* >= 64 bytes to go. */ 429 | copy_16_bytes 64, \line_size, \prefetch_distance 430 | /* >= 48 bytes to go. */ 431 | copy_16_bytes 48, \line_size, \prefetch_distance 432 | /* >= 32 bytes to go. */ 433 | copy_16_bytes 32, \line_size, \prefetch_distance 434 | /* At this point there are 16 to 31 bytes to go. */ 435 | tst r2, #15 436 | ldmia r1!, {r3, r4, r5, r6} 437 | cmpne r2, #8 438 | /* 439 | * If r2 == 8, we need to clear the eq flag while 440 | * making sure carry remains set. 441 | */ 442 | tsteq r2, #15 443 | stmia r0!, {r3, r4, r5, r6} 444 | /* 445 | * The equal flag is set if there are no bytes left. 446 | * The carry flag is set is there are >= 8 bytes left. 447 | */ 448 | pop {r4, r5, r6} 449 | beq 4f 450 | 451 | 2: 452 | /* 453 | * ARM mode imposes restrictions on the registers used 454 | * in double-word loads and stored so we have to use 455 | * single-word operations. 456 | */ 457 | .if \aligned_access == 0 458 | ARM( ldrcs r3, [r1], #4 ) 459 | ARM( ldrcs ip, [r1], #4 ) 460 | ARM( strcs r3, [r0], #4 ) 461 | ARM( strcs ip, [r0], #4 ) 462 | THUMB( ldrdcs r3, ip, [r1], #8 ) 463 | THUMB( strdcs r3, ip, [r0], #8 ) 464 | .else 465 | ldrcs r3, [r1], #4 466 | ldrcs ip, [r1], #4 467 | strcs r3, [r0], #4 468 | strcs ip, [r0], #4 469 | .endif 470 | tst r2, #4 471 | ldrne ip, [r1], #4 472 | strne ip, [r0], #4 473 | tst r2, #3 474 | popeq {r0} 475 | bxeq lr 476 | 477 | /* 478 | * Handle the last up to three bytes. Unaligned access 479 | * make take place if source or destination is not 480 | * half-word aligned. 481 | */ 482 | 3: movs r2, r2, lsl #31 483 | ldrhcs r3, [r1], #2 484 | strhcs r3, [r0], #2 485 | ldrbne r3, [r1], #1 486 | strbne r3, [r0], #1 487 | 4: pop {r0} 488 | bx lr 489 | 490 | 5: /* 491 | * Sizes <= SMALL_SIZE_THRESHOLD bytes, both source and 492 | * destination aligned. 493 | */ 494 | #if SMALL_SIZE_THRESHOLD <= 15 495 | cmp r2, #8 /* cs if r2 >= 8. */ 496 | b 2b 497 | #else 498 | 101: tst r2, #4 499 | ldrne r3, [r1], #4 500 | subne r2, r2, #4 501 | strne r3, [r0], #4 502 | cmp r2, #8 503 | blt 3b 504 | 6: cmp r2, #16 505 | ldr r3, [r1], #4 506 | ldr ip, [r1], #4 507 | str r3, [r0], #4 508 | sub r2, r2, #8 509 | str ip, [r0], #4 510 | bge 6b 511 | cmp r2, #0 512 | popeq {r0} 513 | bxeq lr 514 | b 3b 515 | #endif 516 | 517 | #endif /* FAST_PATH_THRESHOLD > 0 */ 518 | 519 | .if \aligned_access == 1 520 | /* 521 | * Handle the last up to three bytes avoiding 522 | * unaligned memory access. 523 | */ 524 | 7: movs r2, r2, lsl #31 525 | ldrbcs r3, [r1], #1 526 | ldrbcs ip, [r1], #1 527 | strbcs r3, [r0], #1 528 | strbcs ip, [r0], #1 529 | ldrbne r3, [r1], #1 530 | strbne r3, [r0], #1 531 | pop {r0} 532 | bx lr 533 | .endif 534 | 535 | #if FAST_PATH_THRESHOLD > 0 536 | .if \aligned_access == 0 537 | 7: /* 538 | * Unaligned source or destination. There are seperate small 539 | * size thresholds for when both source and destination are 540 | * unaligned and the other case. 541 | */ 542 | tst r0, #3 543 | mov r3, #UNALIGNED_SMALL_SIZE_THRESHOLD 544 | tstne r1, #3 545 | movne r3, #BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 546 | cmp r2, r3 547 | bgt 30f 548 | 549 | /* Small sizes, unaligned case. Use single word load/stores. */ 550 | #if SMALL_SIZE_THRESHOLD >= 16 551 | /* Use the identical code path already defined above. */ 552 | b 101b 553 | #else 554 | tst r2, #4 555 | ldrne r3, [r1], #4 556 | subne r2, r2, #4 557 | strne r3, [r0], #4 558 | cmp r2, #8 559 | blt 3b 560 | 8: cmp r2, #16 561 | ldr r3, [r1], #4 562 | ldr ip, [r1], #4 563 | str r3, [r0], #4 564 | sub r2, r2, #8 565 | str ip, [r0], #4 566 | bge 8b 567 | b 3b 568 | #endif 569 | .endif 570 | #endif /* FAST_PATH_THRESHOLD > 0 */ 571 | 572 | 10: /* 573 | * This is the start of the handling of larger sizes for 574 | * aligned copies. 575 | * 576 | * Size > FAST_PATH_THRESHOLD (256). 577 | * ip is the line_sized aligned source address for preloads. 578 | */ 579 | 580 | .if \write_align >= 16 581 | ands r3, r0, #(\write_align - 1) 582 | push {r4} 583 | rsb r3, r3, #\write_align 584 | beq 17f 585 | push {lr} 586 | bl 20f 587 | pop {lr} 588 | 17: 589 | .elseif \write_align == 8 590 | /* 591 | * For write alignment of 8, it is quickest to do a simple 592 | * conditional load/store. 593 | */ 594 | tst r0, #4 595 | push {r4} 596 | ldrne r3, [r1], #4 597 | subne r2, r2, #4 598 | strne r3, [r0], #4 599 | .else 600 | push {r4} 601 | .endif 602 | 603 | 18: 604 | .if (FAST_PATH_THRESHOLD - (\write_align - 4)) < \line_size 605 | cmp r2, #\line_size 606 | blt 15f 607 | .endif 608 | subs r2, r2, #\line_size 609 | 610 | 16: /* 611 | * This is the entry-point when source and destination were 612 | * initially unaligned but are now aligned because they had 613 | * the same alignment within a word. Write alignment and 614 | * size check has already been handled. 615 | */ 616 | 617 | push {r5-r11} 618 | 619 | /* 620 | * Assume a preload at aligned base + line_size will 621 | * be useful. 622 | */ 623 | mov r4, ip 624 | pld [ip, #\line_size] 625 | add r5, r1, #(\prefetch_distance * \line_size) 626 | subs r2, r2, #(\prefetch_distance * \line_size) 627 | bic r3, r5, #(\line_size - 1) 628 | add r4, r4, #(2 * \line_size) 629 | blt 14f 630 | cmp r4, r3 631 | sub ip, r3, r1 632 | /* 633 | * "Catch-up" the early preloads (which have been performed up 634 | * to aligned source address + line_size) to the preload offset 635 | * used in the main loop. 636 | */ 637 | bge 12f 638 | 11: adds r4, r4, #\line_size /* Thumb16 */ 639 | cmp r4, r3 640 | pld [r4, #(- \line_size)] 641 | blt 11b 642 | 12: 643 | 644 | /* 645 | * The main loop for large sizes. Copy 32 bytes at a time 646 | * using ldmia/stmia while prefetching a 32-byte aligned 647 | * address for line size 32, or 64 bytes at a time while 648 | * prefetching a 64-byte aligned address for line size 64. 649 | */ 650 | 13: pld [r1, ip] 651 | 14: 652 | .if \line_size == 32 653 | ldmia r1!, {r4-r7} 654 | subs r2, r2, #32 655 | ldmia r1!, {r8-r11} 656 | stmia r0!, {r4-r7} 657 | stmia r0!, {r8-r11} 658 | .else 659 | ldmia r1!, {r4-r11} 660 | subs r2, r2, #64 661 | stmia r0!, {r4-r11} 662 | ldmia r1!, {r4-r11} 663 | stmia r0!, {r4-r11} 664 | .endif 665 | bge 13b 666 | cmn r2, #(\prefetch_distance * \line_size) 667 | bge 14b 668 | /* Correct the count. */ 669 | adds r2, r2, #((\prefetch_distance + 1) * \line_size) 670 | pop {r5-r11} 671 | 672 | 15: ands r3, r2, #60 673 | .if \write_align <= 8 674 | /* 675 | * When the subroutine is not used for write alignment, the 676 | * subroutine will only be called once, so branch without 677 | * linking. 678 | */ 679 | bne 20f 680 | 19: 681 | .else 682 | mov ip, lr 683 | blne 20f 684 | mov lr, ip 685 | .endif 686 | pop {r4} 687 | #if FAST_PATH_THRESHOLD > 0 688 | cmp r2, #0 689 | bne 3b 690 | #else 691 | ARM( cmp r2, #0 ) 692 | ARM( beq 4f ) 693 | THUMB( cbz r2, 4f ) 694 | /* Handle the last up to three bytes. */ 695 | 3: movs r2, r2, lsl #31 696 | ldrhcs r3, [r1], #2 697 | strhcs r3, [r0], #2 698 | ldrbne r3, [r1], #1 699 | strbne r3, [r0], #1 700 | 4: 701 | #endif 702 | pop {r0} 703 | bx lr 704 | 705 | /* 706 | * Subroutine that copies a multiple of 4 bytes of size 707 | * r3 from 0 to 64 or 32 bytes. r2 is decremented by the 708 | * number of bytes copied. 709 | */ 710 | 20: tst r3, #4 711 | sub r2, r2, r3 712 | ldrne r4, [r1], #4 713 | subne r3, r3, #4 714 | strne r4, [r0], #4 715 | .if \write_align <= 32 && \line_size == 32 716 | rsb r3, r3, #32 717 | .else 718 | rsb r3, r3, #64 719 | .endif 720 | /* 721 | * These ldmia/stmia instructions are 16-bit on Thumb2, 722 | * 32-bit on ARM. 723 | */ 724 | THUMB( lsrs r3, r3, #1 ) 725 | add pc, pc, r3 726 | nop 727 | ldmia r1!, {r3, r4} 728 | stmia r0!, {r3, r4} 729 | ldmia r1!, {r3, r4} 730 | stmia r0!, {r3, r4} 731 | ldmia r1!, {r3, r4} 732 | stmia r0!, {r3, r4} 733 | ldmia r1!, {r3, r4} 734 | stmia r0!, {r3, r4} 735 | .if \write_align > 32 || \line_size > 32 736 | ldmia r1!, {r3, r4} 737 | stmia r0!, {r3, r4} 738 | ldmia r1!, {r3, r4} 739 | stmia r0!, {r3, r4} 740 | ldmia r1!, {r3, r4} 741 | stmia r0!, {r3, r4} 742 | ldmia r1!, {r3, r4} 743 | stmia r0!, {r3, r4} 744 | .endif 745 | .if \write_align <= 8 746 | b 19b 747 | .else 748 | mov pc, lr 749 | .endif 750 | 751 | 30: /* 752 | * Unaligned case. Align the destination. 753 | * Number of bytes is > UNALIGNED_SMALL_SIZE_THRESHOLD. 754 | * Note: This may use unaligned access. 755 | * ip is the line_size aligned source address for preloads. 756 | */ 757 | ands r3, r0, #3 758 | push {r4} 759 | andeq r3, r1, #3 760 | beq 40f /* Destination is aligned but source is not. */ 761 | /* Align the destination. */ 762 | cmp r3, #2 763 | .if \aligned_access == 1 764 | ldrble r4, [r1], #1 765 | ldrble r3, [r1], #1 766 | suble r2, r2, #2 767 | strble r4, [r0], #1 768 | strble r3, [r0], #1 769 | .else 770 | ldrhle r4, [r1], #2 771 | suble r2, r2, #2 772 | strhle r4, [r0], #2 773 | .endif 774 | ldrbne r4, [r1], #1 775 | subne r2, r2, #1 776 | strbne r4, [r0], #1 777 | ands r3, r1, #3 778 | bne 40f /* Destination is aligned but source is not. */ 779 | 780 | #if 0 && FAST_PATH_THRESHOLD > 0 781 | /* 782 | * Source and destination are now aligned. 783 | * Now recreate the situation of a word-aligned memcpy 784 | * with the current source and destination, 785 | * which may require an extra preload instruction. 786 | * 787 | * This path is currently disabled disabled in favour 788 | * of the one below this which does write alignment and 789 | * jumps into the main loop for larger sizes. 790 | */ 791 | bic r3, r1, #(\line_size - 1) 792 | pop {r4} 793 | cmp r3, ip 794 | THUMB( pldne [r3] ) 795 | THUMB( cmp r2, #FAST_PATH_THRESHOLD ) 796 | THUMB( mov ip, r3 ) 797 | ARM( beq 31f ) 798 | ARM( pld [r3] ) 799 | ARM( mov ip, r3 ) 800 | 31: ARM( cmp r2, #FAST_PATH_THRESHOLD ) 801 | bgt 10b 802 | 803 | /* 804 | * Recreate the fast path small size check here, 805 | * but only if it necessary. 806 | */ 807 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) <= SMALL_SIZE_THRESHOLD || 808 | \aligned_access == 1 809 | cmp r2, #SMALL_SIZE_THRESHOLD 810 | pld [ip, #\line_size] 811 | /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */ 812 | ble 5b 813 | .else 814 | pld [ip, #\line_size] 815 | .endif 816 | bic r3, r2, #15 817 | b 9b 818 | 819 | #else 820 | /* 821 | * Source and destination are now aligned. Check carefully 822 | * whether there are enough bytes to do alignment. 823 | */ 824 | .if \write_align > 0 825 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < (\write_align - 4) \ 826 | || \aligned_access == 1 827 | cmp r2, #(\write_align - 4) 828 | blt 31f 829 | .endif 830 | .if \write_align == 8 831 | /* 832 | * For write alignment of 8, it is quickest to do a simple 833 | * conditional load/store. 834 | */ 835 | tst r0, #4 836 | ldrne r3, [r1], #4 837 | subne r2, r2, #4 838 | strne r3, [r0], #4 839 | .else 840 | ands r3, r0, #(\write_align - 1) 841 | rsb r3, r3, #\write_align 842 | beq 31f 843 | push {lr} 844 | bl 20b 845 | pop {lr} 846 | .endif 847 | 848 | 31: /* 849 | * Check whether there are enough bytes to do one iteration 850 | * of the main loop. 851 | */ 852 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3 - (\write_align - 4)) < \line_size \ 853 | || \aligned_access == 1 854 | cmp r2, #\line_size 855 | blt 15b 856 | .endif 857 | subs r2, r2, #\line_size 858 | .else 859 | /* 860 | * No write alignment. Only have to check for enough bytes to 861 | * do one iteration of the main loop. 862 | */ 863 | 864 | .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < \line_size \ 865 | || \aligned_access == 1 866 | cmp r2, #\line_size 867 | blt 15b 868 | .endif 869 | subs r2, r2, #\line_size 870 | .endif 871 | b 16b 872 | #endif 873 | 874 | 40: /* 875 | * Unaligned case. Size is > SMALL_SIZE_THRESHOLD - 3. 876 | */ 877 | bic r1, r1, #3 878 | cmp r3, #2 879 | ldr r3, [r1], #4 880 | beq 41f 881 | bgt 42f 882 | 883 | unaligned_copy 8, \line_size, \prefetch_distance, \ 884 | \write_align, \aligned_access 885 | 886 | 41: unaligned_copy 16, \line_size, \prefetch_distance, \ 887 | \write_align, \aligned_access 888 | 889 | 42: unaligned_copy 24, \line_size, \prefetch_distance, \ 890 | \write_align, \aligned_access 891 | 892 | .endm 893 | 894 | /* 895 | * The following is a NEON-based memcpy implementation that may use unaligned 896 | * access, but NEON instruction addresses are always at least element aligned. 897 | * It is optimized for both Thumb2 (CONFIG_THUMB) and ARM mode. 898 | * 899 | * - line_size is the cache line size used for prefetches. Must be 64 or 32. 900 | * - prefetch_distance is the number of cache lines to look ahead and must be 901 | * >= 2, or 0 to disable prefetching in the main copying loop. 902 | * - early_prefetch indicates whether to perform early preloads. Must be 0 or 1. 903 | * When prefetch_distance > 0, early_prefetch should be 1. To remove all PLD 904 | * instructions altogether, set both prefetch_distance and early_prefetch 905 | * to 0. 906 | */ 907 | 908 | .macro neon_memcpy_variant line_size, prefetch_distance, early_prefetch 909 | 910 | cmp r2, #3 911 | .if \prefetch_distance > 0 || \early_prefetch == 1 912 | push {r0} 913 | .else 914 | mov ip, r0 915 | .endif 916 | orr r3, r0, r1 917 | ble 8f 918 | .if \prefetch_distance > 0 || \early_prefetch == 1 919 | bic ip, r1, #(\line_size - 1) 920 | .endif 921 | tst r3, #3 922 | .if \early_prefetch == 1 923 | pld [ip] 924 | .endif 925 | bne 10f /* Unaligned source or destination. */ 926 | push {r4} 927 | 928 | /* Aligned source and destination. */ 929 | 1: cmp r2, #256 930 | /* 931 | * Jump to word-aligned NEON fast path <= 256 bytes. 932 | */ 933 | ble 18f 934 | subs r2, r2, #\line_size 935 | 936 | /* Align to a 32-byte boundary. */ 937 | #ifdef CONFIG_THUMB 938 | /* 939 | * Use conditional NEON instructions when 940 | * available (Thumb2 mode) 941 | */ 942 | ands r4, r0, #31 943 | rsb r4, r4, #32 944 | beq 31f 945 | tst r4, #4 946 | sub r2, r2, r4 947 | ldrne r3, [r1 :32], #4 948 | strne r3, [r0 :32], #4 949 | tst r4, #8 950 | vld1ne.32 {d0}, [r1]! 951 | vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! 952 | cmp r4, #16 953 | vld1ge.32 {d2, d3}, [r1]! 954 | vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]! 955 | #else 956 | /* 957 | * Otherwise, branch into a series of single 958 | * loads/stores. 959 | */ 960 | ands r4, r0, #31 961 | beq 31f 962 | rsb r3, r4, #32 963 | lsl r4, r4, #1 964 | sub r2, r2, r3 965 | add pc, pc, r4 966 | nop 967 | ldr r3, [r1], #4 968 | str r3, [r0], #4 969 | ldr r4, [r1], #4 970 | str r4, [r0], #4 971 | ldr r3, [r1], #4 972 | str r3, [r0], #4 973 | ldr r4, [r1], #4 974 | str r4, [r0], #4 975 | ldr r3, [r1], #4 976 | str r3, [r0], #4 977 | ldr r4, [r1], #4 978 | str r4, [r0], #4 979 | ldr r3, [r1], #4 980 | str r3, [r0], #4 981 | ldr r4, [r1], #4 982 | str r4, [r0], #4 983 | #endif 984 | cmp r2, #0 985 | addlt r2, r2, \line_size 986 | blt 6f 987 | 988 | 31: 989 | .if \early_prefetch == 1 990 | pld [ip, #\line_size] 991 | .endif 992 | .if \prefetch_distance > 0 993 | /* 994 | * Assume a preload at aligned base + line_size will 995 | * be useful. 996 | */ 997 | push {r5} 998 | mov r4, ip 999 | add r5, r1, #(\prefetch_distance * \line_size) 1000 | subs r2, r2, #(\prefetch_distance * \line_size) 1001 | bic r3, r5, #(\line_size - 1) 1002 | add r4, r4, #(2 * \line_size) 1003 | blt 5f 1004 | cmp r4, r3 1005 | sub ip, r3, r1 1006 | /* 1007 | * "Catch-up" the early preloads (which have been performed up 1008 | * to aligned source address + line_size) to the preload offset 1009 | * used in the main loop. 1010 | */ 1011 | bge 3f 1012 | 2: adds r4, r4, #\line_size /* Thumb16 */ 1013 | cmp r4, r3 1014 | pld [r4, #(- \line_size)] 1015 | blt 2b 1016 | 3: 1017 | .endif 1018 | 1019 | sub ip, ip, #\line_size 1020 | 4: 1021 | /* 1022 | * Since the destination is 32-byte aligned, 1023 | * specify 256-bit alignment for the NEON stores. 1024 | */ 1025 | .if \line_size == 32 1026 | vld1.32 {d0-d3}, [r1]! 1027 | subs r2, r2, #32 1028 | .if \prefetch_distance > 0 1029 | pld [r1, ip] 1030 | .endif 1031 | vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1032 | .else /* line_size == 64 */ 1033 | vld1.32 {d0-d3}, [r1]! 1034 | vld1.32 {d4-d7}, [r1]! 1035 | .if \prefetch_distance > 0 1036 | pld [r1, ip] 1037 | .endif 1038 | vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1039 | subs r2, r2, #64 1040 | vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! 1041 | .endif 1042 | bge 4b 1043 | .if \prefetch_distance > 0 1044 | 5: 1045 | .if \line_size == 32 1046 | vld1.32 {d0-d3}, [r1]! 1047 | subs r2, r2, #32 1048 | vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1049 | .else /* line_size == 64 */ 1050 | vld1.32 {d0-d3}, [r1]! 1051 | vld1.32 {d4-d7}, [r1]! 1052 | vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1053 | subs r2, r2, #64 1054 | vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! 1055 | .endif 1056 | cmn r2, #(\prefetch_distance * \line_size) 1057 | bge 5b 1058 | .endif 1059 | /* Correct the count. */ 1060 | 23: adds r2, r2, #((\prefetch_distance + 1) * \line_size) 1061 | .if \prefetch_distance > 0 1062 | pop {r5} 1063 | .endif 1064 | 1065 | /* 1066 | * Process the last 0-(line_size - 1) bytes, destination 1067 | * 32-byte aligned, source word aligned. 1068 | */ 1069 | 6: 1070 | #ifdef CONFIG_THUMB 1071 | /* 1072 | * Use conditional NEON instructions when 1073 | * available (Thumb2 mode). 1074 | */ 1075 | .if \line_size == 64 1076 | cmp r2, #32 1077 | vld1ge.32 {d0-d3}, [r1]! 1078 | vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]! 1079 | tst r2, #16 1080 | vld1ne.32 {d0, d1}, [r1]! 1081 | vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]! 1082 | .else 1083 | cmp r2, #16 1084 | vld1ge.32 {d0, d1}, [r1]! 1085 | vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]! 1086 | .endif 1087 | tst r2, #8 1088 | vld1ne.32 {d2}, [r1]! 1089 | vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]! 1090 | tst r2, #4 1091 | ldrne r3, [r1], #4 1092 | strne r3, [r0 :32], #4 1093 | 1094 | pop {r4} 1095 | #else 1096 | /* 1097 | * Just use the world-aligned tail code if we 1098 | * don't have Thumb2. 1099 | */ 1100 | b 17f 1101 | #endif 1102 | 1103 | /* 1104 | * Handle the last up to three bytes. Unaligned access 1105 | * may take place if source or destination is not 1106 | * half-word aligned. 1107 | */ 1108 | 8: movs r2, r2, lsl #31 1109 | ldrhcs r3, [r1], #2 1110 | strhcs r3, [r0], #2 1111 | ldrbne r3, [r1], #1 1112 | strbne r3, [r0] 1113 | 9: 1114 | .if \prefetch_distance > 0 || \early_prefetch == 1 1115 | pop {r0} 1116 | .else 1117 | mov r0, ip 1118 | .endif 1119 | bx lr 1120 | 1121 | 10: /* 1122 | * Unaligned case. Align the destination. 1123 | * Number of bytes is > 3. 1124 | * Note: This may use unaligned access. 1125 | * ip is the line_size aligned source address for preloads. 1126 | */ 1127 | cmp r2, #64 1128 | push {r4} 1129 | /* For small sizes < 64 bytes just use the unaligned tail code. */ 1130 | blt 16f 1131 | ands r3, r0, #3 1132 | beq 11f /* Destination is aligned but source is not. */ 1133 | /* Align the destination. */ 1134 | cmp r3, #2 1135 | ldrbne r4, [r1], #1 1136 | subne r2, r2, #1 1137 | strbne r4, [r0], #1 1138 | ldrhle r4, [r1], #2 1139 | suble r2, r2, #2 1140 | strhle r4, [r0], #2 1141 | tst r1, #3 1142 | beq 1b /* Destination and source are now aligned. */ 1143 | /* Destination is now aligned to a word boundary. */ 1144 | 11: 1145 | cmp r2, #64 1146 | /* 1147 | * Jump to non-aligned NEON tail code for <= 64 bytes. 1148 | */ 1149 | ble 16f 1150 | subs r2, r2, #\line_size 1151 | 1152 | /* Align destination to a 32-byte boundary. */ 1153 | ands r4, r0, #31 1154 | rsb r4, r4, #32 1155 | beq 20f 1156 | tst r4, #4 1157 | sub r2, r2, r4 1158 | ldrne r3, [r1 :8], #4 /* Unaligned access. */ 1159 | strne r3, [r0 :32], #4 1160 | tst r4, #8 1161 | #ifdef CONFIG_THUMB 1162 | /* 1163 | * Use conditional NEON instructions when 1164 | * available (Thumb2 mode) 1165 | */ 1166 | vld1ne.8 {d0}, [r1]! 1167 | vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! 1168 | cmp r4, #16 1169 | vld1ge.8 {d2, d3}, [r1]! 1170 | vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]! 1171 | #else 1172 | beq 31f 1173 | vld1.8 {d0}, [r1]! 1174 | vst1.64 {d0}, [r0 NEON_ALIGN(64)]! 1175 | 31: cmp r4, #16 1176 | blt 32f 1177 | vld1.8 {d2, d3}, [r1]! 1178 | vst1.64 {d2, d3}, [r0 NEON_ALIGN(128)]! 1179 | 32: 1180 | #endif 1181 | cmp r2, #0 1182 | addlt r2, r2, #\line_size 1183 | blt 16f 1184 | 20: 1185 | 1186 | .if \early_prefetch == 1 1187 | pld [ip, #\line_size] 1188 | .endif 1189 | .if \prefetch_distance > 0 1190 | /* 1191 | * Assume a preload at aligned base + line_size will 1192 | * be useful. 1193 | */ 1194 | push {r5} 1195 | mov r4, ip 1196 | add r5, r1, #(\prefetch_distance * \line_size) 1197 | subs r2, r2, #(\prefetch_distance * \line_size) 1198 | bic r3, r5, #(\line_size - 1) 1199 | add r4, r4, #(2 * \line_size) 1200 | blt 15f 1201 | cmp r4, r3 1202 | sub ip, r3, r1 1203 | /* 1204 | * "Catch-up" the early preloads (which have been performed up 1205 | * to aligned source address + line_size) to the preload offset 1206 | * used in the main loop. 1207 | */ 1208 | bge 13f 1209 | 12: adds r4, r4, #\line_size /* Thumb16 */ 1210 | cmp r4, r3 1211 | pld [r4, #(- \line_size)] 1212 | blt 12b 1213 | .endif 1214 | 1215 | 13: 1216 | /* 1217 | * Process 64 unaligned bytes from source at a time and copy 1218 | * them to the 32-byte aligned destination. 1219 | */ 1220 | 14: 1221 | .if \prefetch_distance > 0 1222 | pld [r1, ip] 1223 | .endif 1224 | 15: 1225 | .if \line_size == 32 1226 | vld1.8 {d0-d3}, [r1]! 1227 | subs r2, r2, #32 1228 | vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1229 | .else /* line_size == 64 */ 1230 | vld1.8 {d0-d3}, [r1]! 1231 | vld1.8 {d4-d7}, [r1]! 1232 | vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1233 | subs r2, r2, #64 1234 | vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! 1235 | .endif 1236 | bge 14b 1237 | .if \prefetch_distance > 0 1238 | cmn r2, #(\prefetch_distance * \line_size) 1239 | bge 15b 1240 | .endif 1241 | /* Correct the count. */ 1242 | adds r2, r2, #((\prefetch_distance + 1) * \line_size) 1243 | .if \prefetch_distance > 0 1244 | pop {r5} 1245 | .endif 1246 | 1247 | /* 1248 | * Handle last 0-(line_size - 1) bytes (destination 32-byte 1249 | * aligned source unaligned). 1250 | */ 1251 | #ifdef CONFIG_THUMB 1252 | /* 1253 | * Use conditional NEON instructions when 1254 | * available (Thumb2 mode) 1255 | */ 1256 | .if \line_size == 64 1257 | cmp r2, #32 1258 | vld1ge.8 {d0-d3}, [r1]! 1259 | vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]! 1260 | tst r2, #16 1261 | vld1ne.8 {d0, d1}, [r1]! 1262 | vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]! 1263 | .else 1264 | cmp r2, #16 1265 | vld1ge.8 {d0, d1}, [r1]! 1266 | vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]! 1267 | .endif 1268 | tst r2, #8 1269 | vld1ne.8 {d2}, [r1]! 1270 | vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]! 1271 | tst r2, #4 1272 | ldrne r3, [r1], #4 1273 | strne r3, [r0 :32], #4 1274 | 1275 | pop {r4} 1276 | b 8b 1277 | #else 1278 | /* 1279 | * Fall through to the code below. It is not entirely 1280 | * optimal because it does not indicate the destination 1281 | * is word aligned. 1282 | */ 1283 | #endif 1284 | 1285 | /* Handle small size of 0-63 bytes, unaligned. */ 1286 | 16: bic r3, r2, #7 1287 | rsb r4, r3, #64 1288 | tst r2, #7 1289 | add pc, pc, r4 1290 | nop 1291 | vld1.8 {d0}, [r1]! 1292 | vst1.8 {d0}, [r0]! 1293 | vld1.8 {d1}, [r1]! 1294 | vst1.8 {d1}, [r0]! 1295 | vld1.8 {d0}, [r1]! 1296 | vst1.8 {d0}, [r0]! 1297 | vld1.8 {d1}, [r1]! 1298 | vst1.8 {d1}, [r0]! 1299 | vld1.8 {d0}, [r1]! 1300 | vst1.8 {d0}, [r0]! 1301 | vld1.8 {d1}, [r1]! 1302 | vst1.8 {d1}, [r0]! 1303 | vld1.8 {d0}, [r1]! 1304 | vst1.8 {d0}, [r0]! 1305 | vld1.8 {d1}, [r1]! 1306 | vst1.8 {d1}, [r0]! 1307 | pop {r4} 1308 | beq 9b 1309 | tst r2, #4 1310 | ldrne r3, [r1 :8], #4 /* Unaligned access. */ 1311 | strne r3, [r0], #4 1312 | b 8b 1313 | 1314 | /* Handle small size of 0-63 bytes, word aligned. */ 1315 | 17: 1316 | #ifdef CONFIG_THUMB 1317 | cmp r2, #32 1318 | vld1ge.32 {d0-d3}, [r1]! 1319 | vst1ge.32 {d0-d3}, [r0]! 1320 | tst r2, #16 1321 | vld1ne.32 {d0, d1}, [r1]! 1322 | vst1ne.32 {d0, d1}, [r0]! 1323 | tst r2, #8 1324 | vld1ne.32 {d2}, [r1]! 1325 | vst1ne.32 {d2}, [r0]! 1326 | tst r2, #7 1327 | #else 1328 | bic r3, r2, #7 1329 | rsb r4, r3, #64 1330 | tst r2, #7 1331 | add pc, pc, r4 1332 | nop 1333 | vld1.32 {d0}, [r1]! 1334 | vst1.32 {d0}, [r0]! 1335 | vld1.32 {d1}, [r1]! 1336 | vst1.32 {d1}, [r0]! 1337 | vld1.32 {d0}, [r1]! 1338 | vst1.32 {d0}, [r0]! 1339 | vld1.32 {d1}, [r1]! 1340 | vst1.32 {d1}, [r0]! 1341 | vld1.32 {d0}, [r1]! 1342 | vst1.32 {d0}, [r0]! 1343 | vld1.32 {d1}, [r1]! 1344 | vst1.32 {d1}, [r0]! 1345 | vld1.32 {d0}, [r1]! 1346 | vst1.32 {d0}, [r0]! 1347 | vld1.32 {d1}, [r1]! 1348 | vst1.32 {d1}, [r0]! 1349 | #endif 1350 | pop {r4} 1351 | beq 9b 1352 | tst r2, #4 1353 | ldrne r3, [r1], #4 1354 | strne r3, [r0], #4 1355 | b 8b 1356 | 1357 | /* 1358 | * Fast path for <= 256 bytes, word aligned. 1359 | * This is hardcoded for a preload offset of 128 bytes, 1360 | * which seems to work well in practice for small sizes. 1361 | */ 1362 | 18: bics r3, r2, #31 1363 | .if \early_prefetch == 1 1364 | pld [ip, #32] 1365 | beq 21f 1366 | pld [ip, #64] 1367 | pld [ip, #96] 1368 | .endif 1369 | rsb r4, r3, #256 1370 | ands r2, r2, #31 1371 | /* 1372 | * Each code block handling 32 bytes is 1373 | * 12 bytes long. 1374 | */ 1375 | lsr r4, r4, #2 1376 | add ip, ip, #128 1377 | add r4, r4, r4, lsr #1 1378 | sub ip, ip, r1 1379 | add pc, pc, r4 1380 | nop 1381 | pld [r1, ip] 1382 | vld1.32 {d0-d3}, [r1]! 1383 | vst1.32 {d0-d3}, [r0]! 1384 | pld [r1, ip] 1385 | vld1.32 {d4-d7}, [r1]! 1386 | vst1.32 {d4-d7}, [r0]! 1387 | pld [r1, ip] 1388 | vld1.32 {d0-d3}, [r1]! 1389 | vst1.32 {d0-d3}, [r0]! 1390 | pld [r1, ip] 1391 | vld1.32 {d4-d7}, [r1]! 1392 | vst1.32 {d4-d7}, [r0]! 1393 | pld [r1, ip] 1394 | vld1.32 {d0-d3}, [r1]! 1395 | vst1.32 {d0-d3}, [r0]! 1396 | W(nop) 1397 | vld1.32 {d4-d7}, [r1]! 1398 | vst1.32 {d4-d7}, [r0]! 1399 | W(nop) 1400 | vld1.32 {d0-d3}, [r1]! 1401 | vst1.32 {d0-d3}, [r0]! 1402 | W(nop) 1403 | vld1.32 {d4-d7}, [r1]! 1404 | vst1.32 {d4-d7}, [r0]! 1405 | beq 19f 1406 | 21: 1407 | #ifdef CONFIG_THUMB 1408 | cmp r2, #16 1409 | vld1ge.32 {d0-d1}, [r1]! 1410 | vst1ge.32 {d0-d1}, [r0]! 1411 | tst r2, #8 1412 | vld1ne.32 {d0}, [r1]! 1413 | vst1ne.32 {d0}, [r0]! 1414 | #else 1415 | cmp r2, #16 1416 | ldmiage r1!, {r3, r4} 1417 | stmiage r0!, {r3, r4} 1418 | ldmiage r1!, {r3, r4} 1419 | stmiage r0!, {r3, r4} 1420 | tst r2, #8 1421 | ldmiane r1!, {r3, r4} 1422 | stmiane r0!, {r3, r4} 1423 | #endif 1424 | tst r2, #4 1425 | pop {r4} 1426 | ldrne r3, [r1], #4 1427 | strne r3, [r0 :32], #4 1428 | and r2, r2, #3 1429 | b 8b 1430 | 19: 1431 | pop {r4} 1432 | .if \prefetch_distance > 0 || \early_prefetch == 1 1433 | pop {r0} 1434 | .else 1435 | mov r0, ip 1436 | .endif 1437 | bx lr 1438 | .endm 1439 | 1440 | 1441 | #if defined(MEMCPY_REPLACEMENT_RPI) || defined(MEMCPY_REPLACEMENT_ARMV7_32) \ 1442 | || defined(MEMCPY_REPLACEMENT_ARMV7_64) || defined(MEMCPY_REPLACEMENT_NEON_32) \ 1443 | || defined(MEMCPY_REPLACEMENT_NEON_64) 1444 | 1445 | #ifdef MEMCPY_REPLACEMENT_RPI 1446 | asm_function memcpy 1447 | memcpy_variant 32, 3, 8, 0 1448 | .endfunc 1449 | #endif 1450 | 1451 | #ifdef MEMCPY_REPLACEMENT_ARMV7_32 1452 | asm_function memcpy 1453 | memcpy_variant 32, 6, 0, 0 1454 | .endfunc 1455 | #endif 1456 | 1457 | #ifdef MEMCPY_REPLACEMENT_ARMV7_64 1458 | asm_function memcpy 1459 | memcpy_variant 64, 3, 0, 0 1460 | .endfunc 1461 | #endif 1462 | 1463 | #ifdef MEMCPY_REPLACEMENT_NEON_32 1464 | asm_function memcpy 1465 | neon_memcpy_variant 32, 6, 1 1466 | .endfunc 1467 | #endif 1468 | 1469 | #ifdef MEMCPY_REPLACEMENT_NEON_64 1470 | asm_function memcpy 1471 | neon_memcpy_variant 64, 3, 1 1472 | .endfunc 1473 | #endif 1474 | 1475 | #ifdef MEMCPY_REPLACEMENT_NEON_AUTO 1476 | asm_function memcpy 1477 | neon_memcpy_variant 32, 0, 1 1478 | .endfunc 1479 | #endif 1480 | 1481 | #else 1482 | 1483 | asm_function memcpy_new_line_size_64_preload_192 1484 | memcpy_variant 64, 3, 0, 0 1485 | .endfunc 1486 | 1487 | asm_function memcpy_new_line_size_64_preload_192_align_32 1488 | memcpy_variant 64, 3, 32, 0 1489 | .endfunc 1490 | 1491 | asm_function memcpy_new_line_size_64_preload_192_aligned_access 1492 | memcpy_variant 64, 3, 0, 1 1493 | .endfunc 1494 | 1495 | asm_function memcpy_new_line_size_32_preload_192 1496 | memcpy_variant 32, 6, 0, 0 1497 | .endfunc 1498 | 1499 | asm_function memcpy_new_line_size_32_preload_192_align_32 1500 | memcpy_variant 32, 6, 32, 0 1501 | .endfunc 1502 | 1503 | asm_function memcpy_new_line_size_32_preload_96 1504 | memcpy_variant 32, 3, 8, 0 1505 | .endfunc 1506 | 1507 | asm_function memcpy_new_line_size_32_preload_96_aligned_access 1508 | memcpy_variant 32, 3, 8, 1 1509 | .endfunc 1510 | 1511 | asm_function memcpy_new_neon_line_size_64 1512 | neon_memcpy_variant 64, 3, 1 1513 | .endfunc 1514 | 1515 | asm_function memcpy_new_neon_line_size_32 1516 | neon_memcpy_variant 32, 6, 1 1517 | .endfunc 1518 | 1519 | asm_function memcpy_new_neon_line_size_32_auto 1520 | neon_memcpy_variant 32, 0, 1 1521 | .endfunc 1522 | 1523 | #endif 1524 | 1525 | /* 1526 | * Macro for memset replacement. 1527 | * write_align must be 0, 8, or 32. 1528 | * use_neon must be 0 or 1. 1529 | */ 1530 | 1531 | .macro memset_variant write_align, use_neon 1532 | .if \use_neon == 1 1533 | .fpu neon 1534 | .endif 1535 | ands r3, r0, #3 1536 | mov ip, r0 1537 | bne 7f 1538 | 1539 | /* Destination is word aligned. */ 1540 | 1: orr r1, r1, r1, lsl #8 1541 | .if \use_neon == 1 1542 | cmp r2, #16 1543 | .else 1544 | cmp r2, #8 1545 | .endif 1546 | orr r1, r1, r1, lsl #16 1547 | .if \use_neon == 1 1548 | blt 13f 1549 | vmov d0, r1, r1 1550 | vmov d1, r1, r1 1551 | .else 1552 | blt 5f 1553 | mov r3, r1 1554 | .endif 1555 | 1556 | cmp r2, #64 1557 | push {r4} 1558 | .if \use_neon == 1 1559 | blt 10f 1560 | .else 1561 | ble 10f 1562 | .endif 1563 | .if \write_align > 0 1564 | ands r4, r0, #(\write_align - 1) 1565 | .if \use_neon == 1 1566 | #ifndef CONFIG_THUMB 1567 | add r3, r4, #7 1568 | #endif 1569 | .endif 1570 | /* Let r4 be equal to the number of bytes to align. */ 1571 | rsb r4, r4, #\write_align 1572 | /* 1573 | * At this point r4 contains the number of bytes to align 1574 | * if eq is not set. The eq flag is set if there are no bytes 1575 | * to align. 1576 | */ 1577 | .if \write_align == 8 1578 | subne r2, r2, r4 1579 | strne r1, [r0], #4 1580 | .elseif \write_align == 32 1581 | beq 2f 1582 | tst r4, #4 1583 | sub r2, r2, r4 1584 | strne r1, [r0], #4 1585 | .if \use_neon == 1 1586 | #ifdef CONFIG_THUMB 1587 | tst r4, #8 1588 | vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! 1589 | cmp r4, #16 1590 | vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]! 1591 | #else 1592 | bic r4, r3, #7 1593 | lsr r4, r4, #1 1594 | add pc, pc, r4 1595 | nop 1596 | vst1.64 {d0}, [r0 NEON_ALIGN(64)]! 1597 | vst1.64 {d0}, [r0 NEON_ALIGN(64)]! 1598 | vst1.64 {d0}, [r0 NEON_ALIGN(64)]! 1599 | vst1.64 {d0}, [r0 NEON_ALIGN(64)]! 1600 | #endif 1601 | .else 1602 | tst r4, #8 1603 | stmiane r0!, {r1, r3} 1604 | cmp r4, #16 1605 | stmiage r0!, {r1, r3} 1606 | stmiage r0!, {r1, r3} 1607 | .endif 1608 | .endif /* \write_align == 32 */ 1609 | cmp r2, #64 1610 | blt 4f 1611 | .endif /* \write_align > 0 */ 1612 | 1613 | 2: 1614 | .if \use_neon == 1 1615 | /* 1616 | * When NEON is enabled, \write_align is 1617 | * equal to 32 so specify 256-bit alignment in the 1618 | * NEON store instructions. 1619 | */ 1620 | subs r2, r2, #64 1621 | vmov q1, q0 1622 | 3: vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1623 | subs r2, r2, #64 1624 | vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! 1625 | bge 3b 1626 | adds r2, r2, #64 1627 | .else 1628 | mov r4, r1 1629 | subs r2, r2, #64 1630 | push {r5} 1631 | mov r5, r1 1632 | 1633 | 3: stmia r0!, {r1, r3, r4, r5} 1634 | subs r2, r2, #64 /* Thumb16 */ 1635 | stmia r0!, {r1, r3, r4, r5} 1636 | stmia r0!, {r1, r3, r4, r5} 1637 | stmia r0!, {r1, r3, r4, r5} 1638 | bge 3b 1639 | adds r2, r2, #64 /* Thumb16 */ 1640 | 1641 | pop {r5} 1642 | .endif 1643 | /* Early exit if there are 0 bytes left. */ 1644 | /* THUMB( cbz r2, 9f ) */ 1645 | THUMB( cmp r2, #0 ) 1646 | THUMB( beq 9f ) 1647 | ARM( teq r2, #0 ) 1648 | ARM( beq 9f ) 1649 | /* 1650 | * Handle 8-64 bytes (or 16-63 bytes in case of NEON). 1651 | * In case of NEON, destination must be 8-byte aligned. 1652 | */ 1653 | 4: 1654 | .if \use_neon == 1 1655 | #ifdef CONFIG_THUMB 1656 | vmov q1, q0 1657 | cmp r2, #32 1658 | vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(64)]! 1659 | tst r2, #16 1660 | vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(64)]! 1661 | tst r2, #8 1662 | vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! 1663 | and r2, r2, #7 1664 | #else 1665 | bic r4, r2, #15 1666 | subs r2, r2, r4 1667 | rsb r4, r4, #64 1668 | /* 1669 | * When using NEON, the vst instruction 1670 | * (storing 16 bytes) is always 32-bit. 1671 | */ 1672 | lsr r4, r4, #2 1673 | add pc, pc, r4 1674 | nop 1675 | vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]! 1676 | vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]! 1677 | vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]! 1678 | vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]! 1679 | cmp r2, #8 1680 | strge r1, [r0], #4 1681 | strge r1, [r0], #4 1682 | subge r2, r2, #8 1683 | #endif 1684 | .else /* use_neon == 0 */ 1685 | bic r4, r2, #7 1686 | subs r2, r2, r4 1687 | rsb r4, r4, #64 1688 | /* 1689 | * The stmia instruction (storing 8 bytes) is 32-bit for ARM, 1690 | * 16-bit for Thumb2. 1691 | */ 1692 | THUMB( lsrs r4, r4, #2 ) 1693 | ARM( lsr r4, r4, #1 ) 1694 | add pc, pc, r4 1695 | nop 1696 | stmia r0!, {r1, r3} 1697 | stmia r0!, {r1, r3} 1698 | stmia r0!, {r1, r3} 1699 | stmia r0!, {r1, r3} 1700 | stmia r0!, {r1, r3} 1701 | stmia r0!, {r1, r3} 1702 | stmia r0!, {r1, r3} 1703 | stmia r0!, {r1, r3} 1704 | .endif 1705 | 14: pop {r4} 1706 | 1707 | 5: cmp r2, #4 1708 | strge r1, [r0], #4 1709 | /* Early exit for multiple of 4 size. */ 1710 | ands r2, r2, #3 1711 | moveq r0, ip 1712 | bxeq lr 1713 | 1714 | /* 1715 | * At this point there are 1, 2 or 3 bytes, 1716 | * and the destination is aligned. 1717 | */ 1718 | 6: cmp r2, #2 1719 | strhge r1, [r0], #2 1720 | strbne r1, [r0] 1721 | mov r0, ip 1722 | bx lr 1723 | 1724 | .if \use_neon == 1 1725 | /* 0-15 bytes left, word aligned. */ 1726 | 13: cmp r2, #8 1727 | strge r1, [r0] 1728 | strge r1, [r0, #4] 1729 | addge r0, r0, #8 1730 | subge r2, r2, #8 1731 | b 5b 1732 | .endif 1733 | 1734 | /* Unaligned case. */ 1735 | 7: cmp r2, #4 1736 | blt 8f 1737 | #ifdef CONFIG_THUMB 1738 | .if \use_neon == 1 1739 | /* 1740 | * When Thumb2 is enabled with NEON, use the optimized 1741 | * unaligned NEON code path for small sizes. 1742 | */ 1743 | cmp r2, #64 1744 | blt 11f 1745 | .endif 1746 | #endif 1747 | /* Align the destination. */ 1748 | cmp r3, #2 1749 | sub r2, r2, #4 1750 | strble r1, [r0] 1751 | strble r1, [r0, #1] 1752 | addle r0, r0, #2 1753 | add r2, r2, r3 1754 | strbne r1, [r0], #1 1755 | b 1b 1756 | 1757 | /* 0 to 3 bytes left. */ 1758 | 8: cmp r2, #2 1759 | strbge r1, [r0] 1760 | strbge r1, [r0, #1] 1761 | addge r0, r0, #2 1762 | tst r2, #1 1763 | strbne r1, [r0] 1764 | mov r0, ip 1765 | bx lr 1766 | 1767 | 9: pop {r4} 1768 | mov r0, ip 1769 | bx lr 1770 | 1771 | /* 1772 | * Word aligned 8 <= size <= 64 1773 | * (16 <= size <= 63 in case of NEON). 1774 | */ 1775 | 10: 1776 | /* Align the destination to an 8 byte boundary. */ 1777 | tst r0, #4 1778 | strne r1, [r0], #4 1779 | subne r2, r2, #4 1780 | .if \use_neon == 1 1781 | cmp r2, #16 1782 | poplt {r4} 1783 | blt 13b 1784 | .else 1785 | cmp r2, #8 1786 | blt 14b 1787 | .endif 1788 | b 4b 1789 | 1790 | #ifdef CONFIG_THUMB 1791 | .if \use_neon == 1 1792 | /* 1793 | * Handle 4 <= size <= 63 bytes, unaligned. 1794 | * Use unaligned NEON instructions with Thumb2. 1795 | */ 1796 | 11: 1797 | orr r1, r1, r1, lsl #8 1798 | tst r2, #8 1799 | orr r1, r1, r1, lsl #16 1800 | vmov d0, r1, r1 1801 | vst1ne.8 {d0}, [r0]! 1802 | vmov d1, r1, r1 1803 | tst r2, #16 1804 | vst1ne.8 {d0, d1}, [r0]! 1805 | vmov q1, q0 1806 | cmp r2, #32 1807 | and r2, r2, #7 1808 | vst1ge.8 {d0-d3}, [r0]! 1809 | cmp r2, #4 1810 | /* The following store is unaligned. */ 1811 | strge r1, [r0], #4 1812 | subge r2, r2, #4 1813 | b 8b 1814 | .endif 1815 | #endif 1816 | .endm 1817 | 1818 | #if defined(MEMSET_REPLACEMENT_RPI) || defined(MEMSET_REPLACEMENT_ARMV7_32) \ 1819 | || defined(MEMSET_REPLACEMENT_ARMV7_64) || defined(MEMSET_REPLACEMENT_NEON_32) \ 1820 | || defined(MEMSET_REPLACEMENT_NEON_64) 1821 | 1822 | #ifdef MEMSET_REPLACEMENT_RPI 1823 | asm_function memset 1824 | memset_variant 32, 0 1825 | .endfunc 1826 | #endif 1827 | 1828 | #if defined(MEMSET_REPLACEMENT_ARMV7_32) || defined(MEMSET_REPLACEMENT_ARMV7_64) 1829 | asm_function memset 1830 | memset_variant 8, 0 1831 | .endfunc 1832 | #endif 1833 | 1834 | #if defined(MEMSET_REPLACEMENT_NEON_32) || defined(MEMSET_REPLACEMENT_NEON_64) 1835 | asm_function memset 1836 | memset_variant 32, 1 1837 | .endfunc 1838 | #endif 1839 | 1840 | #else 1841 | 1842 | asm_function memset_new_align_0 1843 | memset_variant 0, 0 1844 | .endfunc 1845 | 1846 | asm_function memset_new_align_8 1847 | memset_variant 8, 0 1848 | .endfunc 1849 | 1850 | asm_function memset_new_align_32 1851 | memset_variant 32, 0 1852 | .endfunc 1853 | 1854 | asm_function memset_neon 1855 | memset_variant 32, 1 1856 | .endfunc 1857 | 1858 | #endif 1859 | -------------------------------------------------------------------------------- /new_arm.h: -------------------------------------------------------------------------------- 1 | 2 | extern void *memcpy_new_line_size_64_preload_192(void *dest, 3 | const void *src, size_t n); 4 | 5 | extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest, 6 | const void *src, size_t n); 7 | 8 | extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest, 9 | const void *src, size_t n); 10 | 11 | extern void *memcpy_new_line_size_32_preload_192(void *dest, 12 | const void *src, size_t n); 13 | 14 | extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest, 15 | const void *src, size_t n); 16 | 17 | extern void *memcpy_new_line_size_32_preload_96(void *dest, 18 | const void *src, size_t n); 19 | 20 | extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest, 21 | const void *src, size_t n); 22 | 23 | extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n); 24 | 25 | extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n); 26 | 27 | extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n); 28 | 29 | extern void *memset_new_align_0(void *dest, int c, size_t size); 30 | 31 | extern void *memset_new_align_8(void *dest, int c, size_t size); 32 | 33 | extern void *memset_new_align_32(void *dest, int c, size_t size); 34 | 35 | extern void *memset_neon(void *dest, int c, size_t size); 36 | --------------------------------------------------------------------------------