├── makefile ├── tlb_test.s ├── tlb_test.c └── readme.md /makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-g -O2 -std=gnu99 2 | LDFLAGS=-static 3 | 4 | tlb_test: tlb_test.c tlb_test.s 5 | -------------------------------------------------------------------------------- /tlb_test.s: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015, Cosmin Gorgovan 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
14 | */ 15 | 16 | .global data_test 17 | .func data_test 18 | .type data_test, %function 19 | .arm 20 | @ R0 = buffer, R1 = page_cnt, R2 = total_ops @ dTLB stress loop: performs total_ops loads, one word per page, wrapping back @ to the first page after page_cnt loads. 21 | data_test: 22 | MOV R3, #0 @index 23 | 24 | r:ADD R12, R0, R3, LSL #12 @ R12 = buffer + index*4096 (base of page 'index') 25 | LDR R12, [R12, R3, LSL #4] @ load from buffer + index*(4096+16); the extra index*16 staggers the accesses, presumably to spread them over cache sets 26 | 27 | ADD R3, #1 28 | CMP R3, R1 29 | MOVEQ R3, #0 @ wrap the page index after page_cnt pages 30 | 31 | SUBS R2, #1 @ one load done 32 | BNE r @ repeat until total_ops loads performed 33 | 34 | BX LR 35 | .endfunc 36 | 37 | .global inst_test 38 | .func inst_test 39 | .type inst_test, %function 40 | .arm @ iTLB stress stub, R0 = iteration count. prepare_inst() in tlb_test.c copies @ these 12 bytes to the start of each page, (4096+8) bytes apart, so the BNE @ below (target inst_test+4096+8) lands on the SUBS of the copy on the next @ page; the last copy's branch is patched at runtime to wrap to the first page. 41 | inst_test: 42 | SUBS R0, #1 @ decrement remaining iteration count 43 | BNE inst_test+4096+8 @ hop to the next page's copy of this stub 44 | BX LR @ counter reached zero: return to the C caller 45 | .endfunc 46 | 47 | -------------------------------------------------------------------------------- /tlb_test.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015, Cosmin Gorgovan 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | */ 15 | 16 | #include <stdio.h> 17 | #include <stdlib.h> 18 | #include <string.h> 19 | #include <stdint.h> 20 | #include <assert.h> 21 | #include <sys/mman.h> 22 | 23 | #define PAGE_SIZE 4096 24 | #define OP_CNT (1000*1000*1000) /* loads (d test) or stub iterations (i test) per run */ 25 | #define OVERSIZE 8 /* extra pages mapped: the per-page stagger (16 B/page for data, 8 B/page for inst) spills up to MAX_SIZE*16 B = 8 pages past the end */ 26 | #define MAX_SIZE 2048 27 | #define INST_OFFSET (PAGE_SIZE + 8) /* distance between consecutive copies of the inst_test stub */ 28 | 29 | extern void data_test(void *buf, int page_cnt, int op_cnt); /* tlb_test.s: dTLB load loop */ 30 | extern void *inst_test; /* tlb_test.s: 12-byte iTLB stub copied into each page */ 31 | 32 | typedef void (*itest)(uint32_t count); 33 | 34 | /* Print usage and terminate with EXIT_FAILURE. */ void help() { 35 | printf("\nSyntax: ./tlb_test [d|i] SIZE [-huge]\n" 36 | " d - dTLB test\n" 37 | " i - iTLB test\n" 38 | " SIZE - specified in 4KiB units [1...%d]\n" 39 | " -huge - allocates huge pages\n\n", MAX_SIZE); 40 | exit(EXIT_FAILURE); 41 | } 42 | 43 | /* Copy the inst_test stub to the start of cnt consecutive pages (INST_OFFSET apart) and patch the last copy's branch so it loops back to the first page. */ void prepare_inst(void *buf, int cnt) { 44 | uint32_t *fixup; 45 | void *start_buf = buf; 46 | 47 | for (int i = 0; i < cnt; i++) { 48 | memcpy(buf, &inst_test, 12); 49 | buf += INST_OFFSET; 50 | } 51 | 52 | // Loop back to the first page 53 | fixup = ((uint32_t *)(buf - INST_OFFSET)) + 1; /* second word of the last stub: the BNE */ 54 | *fixup &= 0xFF000000; /* keep condition + opcode, clear the 24-bit offset field */ 55 | *fixup |= ((uint32_t *)start_buf - fixup - 2) & 0xFFFFFF; /* word offset; -2 because an ARM branch's PC reads 8 bytes ahead */ 56 | 57 | __clear_cache(start_buf, fixup + 3); /* sync the instruction cache with the generated code */ 58 | } 59 | 60 | int main(int argc, char **argv) { 61 | int page_cnt; 62 | uint8_t *buf; 63 | int use_huge_pages = 0; 64 | int is_data_test; 65 | itest itlb_test; 66 | 67 | if (argc != 3 && argc != 4) help(); 68 | 69 | if (strcmp(argv[1], "d") == 0) { 70 | is_data_test = 1; 71 | } else if (strcmp(argv[1], "i") == 0) { 72 | is_data_test = 0; 73 | } else { 74 | help(); 75 | }
76 | 77 | page_cnt = atoi(argv[2]); 78 | if (page_cnt < 1 || page_cnt > MAX_SIZE) help(); 79 | 80 | if (argc == 4) { 81 | if (strcmp(argv[3], "-huge") == 0) { 82 | use_huge_pages = 1; 83 | } else { 84 | help(); 85 | } 86 | } 87 | 88 | buf = mmap(NULL, PAGE_SIZE * (page_cnt + OVERSIZE), 89 | PROT_READ | PROT_WRITE | (is_data_test ? 0 : PROT_EXEC), /* the iTLB test executes from buf */ 90 | MAP_PRIVATE|MAP_ANONYMOUS|(use_huge_pages ? MAP_HUGETLB : 0), -1, 0); 91 | assert(buf != MAP_FAILED); 92 | 93 | if (is_data_test) { 94 | data_test(buf, page_cnt, OP_CNT); 95 | } else { 96 | prepare_inst(buf, page_cnt); 97 | itlb_test = (itest)buf; 98 | itlb_test(OP_CNT); /* jump into the first generated stub */ 99 | } 100 | 101 | return 0; 102 | } 103 | 104 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | TLB size and huge pages on ARM cores 2 | ==================================== 3 | 4 | Context 5 | ------- 6 | 7 | In a computer using virtual memory, [page tables](https://en.wikipedia.org/wiki/Page_table) are used to map virtual addresses to physical addresses and to set the R/W/E permissions for each page. Regular page tables on ARMv7 are up to two levels deep and ARMv8 and ARMv7 [LPAE](https://en.wikipedia.org/wiki/Physical_Address_Extension) tables can be up to three levels deep. To avoid walking the page table for each memory access, the pages in use are cached in a [TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer). Cortex-A implementations use a modified Harvard architecture (separate datapaths for instructions and data) with separate small and fast L1 TLBs and a slower and larger unified L2 TLB. 8 | 9 | 10 | Huge pages 11 | ---------- 12 | 13 | The regular page size is 4KiB on most architectures, including ARM. 
To reduce the TLB pressure for applications which work with large datasets or which have large / fragmented code, huge pages (on Linux, 2MiB with ARMv7 LPAE / ARMv8 or 1MiB on ARMv7 without LPAE) can be used. Large page support for LPAE-enabled systems was added in the 3.11 version of the mainline Linux kernel - [patch 1](https://github.com/torvalds/linux/commit/dde1b65110), [patch 2](https://github.com/torvalds/linux/commit/0b19f9335), [patch 3](https://github.com/torvalds/linux/commit/1355e2a6) and [patch 4](https://github.com/torvalds/linux/commit/8d962507) - support for [transparent huge pages](https://lwn.net/Articles/423592/). 14 | 15 | The issue is that many ARMv7 cores from ARM don't properly support caching huge pages in their L1 TLB. To quote the [Cortex-A15 TRM](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0438i/CACCECAH.html): 16 | 17 | > If the page tables map the memory region to a larger granularity than 4K, it only allocates one mapping for the particular 4K region to which the current access corresponds. 18 | 19 | The L2 TLB generally supports huge pages, but it can also cache a high number of entries. Even with 4 KiB pages, I don't expect L2 TLB misses to cause a significant slowdown for most applications. 21 | The table below summarizes the TLB capabilities and sizes for ARM Cortex-A cores: 22 | 23 | | Core | LPAE support | L1 data TLB size | L1 data huge page support | L1 inst. TLB size | L1 inst. huge page support | L2 TLB size | L2 TLB huge page support | 24 | |------------|--------------|--------------------------|---------------------------|-------------------|----------------------------|-------------|--------------------------| 25 | | Cortex-A5 | N | 10 | ? | 10 | ? | 128 | Y | 26 | | Cortex-A7 | Y | 10 | ? | 10 | ? | 256 | Y | 27 | | Cortex-A8 | N | 32 | Y | 32 | Y | N/A | N/A | 28 | | Cortex-A9 | N | 32 | ? | 32 or 64 | ? 
| 4 + (64, 128, 256 or 512) | Y | 29 | | Cortex-A15 | Y | 32 (reads) + 32 (writes) | Optional 1M | 32 | N | 512 | 64K, 1M, 2M, 16M, 1G | 30 | | Cortex-A17 | Y | 32 | 1M | 32, 48 or 64 | 1M | 1024 | 64K, 1M, 2M, 16M, 1G | 31 | | Cortex-A53 | N/A (64-bit) | 10 | ? | 10 | ? | 512 | 64K, 1M, 2M, 16M, 512M (?) | 32 | | Cortex-A57 | N/A (64-bit) | 32 | 64K, 1M | 48 | 64K, 1M | 1024 | 64K, 1M, 2M, 16M, 512M, 1G | 33 | | Cortex-A72 | N/A (64-bit) | 32 | 64K, 1M | 48 | 64K, 1M | 1024 | 64K, 1M, 2M, 16M, 512M, 1G | 34 | | Cortex-A73 | N/A (64-bit) | 48 | 16K, 64K, 1M | 32 | 16K, 64K, 1M | 1024 + 128 | 1024 * (16K, 64K) + 128 * (1M, 2M, 16M, 32M, 512M, 1G) 35 | 36 | 37 | 38 | In the TRMs for A5, A7, A9 and A53 it's not clear what sizes are supported by the L1 micro TLBs. 39 | 40 | Cortex-A8 doesn't seem to have a unified TLB. 41 | 42 | None of the LPAE-enabled cores seem to support 2MB pages in the L1 TLBs. 43 | 44 | 45 | Runtime detection 46 | ----------------- 47 | 48 | Given the large number of cores with vague specifications or vendor-configurable options, I thought it would be interesting to develop a tool and a technique to determine the configuration by observing runtime behaviour, without any access to specs. 49 | 50 | I'm introducing the unimaginatively named tlb test utility, which runs on ARM GNU/Linux systems. 51 | 52 | 53 | Configurations determined using tlb test 54 | ---------------------------------------- 55 | 56 | | System | LPAE support | L1 data TLB size | L1 data huge page support | L1 inst. TLB size | L1 inst. huge page support | L2 TLB size | L2 TLB huge page support | 57 | |------------|--------------|------------------|---------------------------|-------------------|----------------------------|-------------|--------------------------| 58 | | Odroid-X2 (Exynos 4412 Prime, Cortex-A9) | N | 32 | ? | 32 | ? | 4 + 128 | Y | 59 | | Xilinx Zynq Z-7045 (Cortex-A9) | N | 32 | ? | 32 | ? 
| 4 + 128 | Y | 60 | ARM Juno LITTLE core (Cortex-A53) | N/A (64-bit) | 10 | N | 10 | Y | 512 | Y | 61 | Rockchip RK3288 (Cortex-A17) | Y | 32 | Y | 32 | Y | 1024 | Y | 62 | Tegra K1 T124 (Cortex-A15) | Y | 32(+32?) | N | 32 | N | 512 | Y | 63 | Tegra K1 T132 (NVIDIA Denver) | N/A (64-bit) | 256(?) | ? | 128(?) | ? | ? | ? | 64 | APM883208 (APM X-Gene) | N/A (64-bit) | 20 | Y | 10 | N | 1024 | Y | 65 | Tegra X2 T186 (NVIDIA Denver2) | N/A (64-bit) | 256(?) | ? | 128(?) | ? | ? | ? | 66 | 67 | Theory of operation 68 | ------------------- 69 | 70 | The basic idea is to load data (it's only testing the data TLB) from a configurable number of different pages, in quick succession, while minimising the effect of other sources of timing noise. The function *data_test()* in *tlb_test.s* reads *total_ops* words from *buffer*, each read incrementing the pointer by (4096 + 16) bytes, with wrap-around every *page_cnt* reads. 71 | 72 | When actively reading from more pages than the TLB size, performance will suddenly decrease. By using a buffer larger than (L1-data-TLB-size * regular-page-size) and smaller than (L1-data-TLB-size * huge-page-size) allocated using huge pages, we can determine if the L1 TLB can cache huge pages. 73 | 74 | 75 | Example 76 | ------- 77 | 78 | As an example and sanity check, I've run *tlb_test* on a Tegra K1 (Cortex-A15) SoC. I'm using [perf](https://perf.wiki.kernel.org/index.php/Main_Page) to confirm the causes of overhead. It's not strictly required and just execution time is good enough to use tlb_test. Note that the current tool requires a mode selector (*d* or *i*) as its first argument, e.g. *./tlb_test d 16*; the transcripts below omit it. 
First, let's confirm the size of the L1 data TLB: 79 | 80 | ``` 81 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 16 82 | 83 | Performance counter stats for './tlb_test 16': 84 | 85 | 7,001,546,456 instructions # 0.00 insns per cycle 86 | 1,000,544,697 L1-dcache-loads 87 | 45,560 L1-dcache-load-misses # 0.00% of all L1-dcache hits 88 | 50,669 dTLB-load-misses 89 | 90 | 2.368960939 seconds time elapse 91 | 92 | 93 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 32 94 | 95 | Performance counter stats for './tlb_test 32': 96 | 97 | 7,001,545,342 instructions # 0.00 insns per cycle 98 | 1,000,545,046 L1-dcache-loads 99 | 45,619 L1-dcache-load-misses # 0.00% of all L1-dcache hits 100 | 70,292 dTLB-load-misses 101 | 102 | 2.362317209 seconds time elapsed 103 | 104 | 105 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 33 106 | 107 | Performance counter stats for './tlb_test 33': 108 | 109 | 7,001,525,903 instructions # 0.00 insns per cycle 110 | 1,000,538,344 L1-dcache-loads 111 | 44,769 L1-dcache-load-misses # 0.00% of all L1-dcache hits 112 | 48,294,461 dTLB-load-misses 113 | 114 | 2.416565829 seconds time elapsed 115 | 116 | 117 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 40 118 | 119 | Performance counter stats for './tlb_test 40': 120 | 121 | 7,002,175,820 instructions # 0.00 insns per cycle 122 | 1,000,795,312 L1-dcache-loads 123 | 69,207 L1-dcache-load-misses # 0.01% of all L1-dcache hits 124 | 956,369,999 dTLB-load-misses 125 | 126 | 3.886506385 seconds time elapsed 127 | ``` 128 | 129 | 130 | Starting with 33 pages, the number of dTLB misses increases dramatically. Even without perf, we could easily deduce the TLB size is 32 using the timing information. 
131 | 132 | Next, let's confirm the size of the L2 TLB: 133 | 134 | ``` 135 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 48 136 | 137 | Performance counter stats for './tlb_test 48': 138 | 139 | 7,002,547,418 instructions # 0.00 insns per cycle 140 | 1,000,921,383 L1-dcache-loads 141 | 86,844 L1-dcache-load-misses # 0.01% of all L1-dcache hits 142 | 1,000,052,404 dTLB-load-misses 143 | 144 | 4.002053526 seconds time elapsed 145 | 146 | 147 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 256 148 | 149 | Performance counter stats for './tlb_test 256': 150 | 151 | 7,003,012,833 instructions # 0.00 insns per cycle 152 | 1,001,087,251 L1-dcache-loads 153 | 120,170 L1-dcache-load-misses # 0.01% of all L1-dcache hits 154 | 1,000,078,542 dTLB-load-misses 155 | 156 | 3.995740584 seconds time elapsed 157 | 158 | 159 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 512 160 | 161 | Performance counter stats for './tlb_test 512': 162 | 163 | 7,002,650,367 instructions # 0.00 insns per cycle 164 | 1,000,948,480 L1-dcache-loads 165 | 140,842 L1-dcache-load-misses # 0.01% of all L1-dcache hits 166 | 1,000,061,642 dTLB-load-misses 167 | 168 | 4.002590827 seconds time elapsed 169 | 170 | 171 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 513 172 | 173 | Performance counter stats for './tlb_test 513': 174 | 175 | 7,002,650,847 instructions # 0.00 insns per cycle 176 | 1,000,938,797 L1-dcache-loads 177 | 127,003 L1-dcache-load-misses # 0.01% of all L1-dcache hits 178 | 1,000,060,931 dTLB-load-misses 179 | 180 | 4.037016451 seconds time elapsed 181 | 182 | 183 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 520 184 | 185 | Performance counter stats for './tlb_test 520': 186 | 187 | 7,003,596,287 instructions # 0.00 insns per cycle 188 | 
1,001,290,282 L1-dcache-loads 189 | 179,866 L1-dcache-load-misses # 0.02% of all L1-dcache hits 190 | 1,000,092,440 dTLB-load-misses 191 | 192 | 4.549982772 seconds time elapsed 193 | 194 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 1024 195 | 196 | Performance counter stats for './tlb_test 1024': 197 | 198 | 7,008,804,035 instructions # 0.00 insns per cycle 199 | 1,003,187,234 L1-dcache-loads 200 | 465,421 L1-dcache-load-misses # 0.05% of all L1-dcache hits 201 | 1,000,212,790 dTLB-load-misses 202 | 203 | 16.194994147 seconds time elapsed 204 | ``` 205 | 206 | Note how the performance with 48, 256 and 512 pages is practically identical, while with 520 it is significantly slower. When the data size is double the L2 TLB size (1024 pages), execution slows down by a factor of 4x. 207 | 208 | Now let's see if using huge pages can help. First, when the data size would fit in the L2 TLB using normal pages, but not in the L1 TLB: 209 | 210 | ``` 211 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 40 -huge 212 | 213 | Performance counter stats for './tlb_test 40 -huge': 214 | 215 | 7,003,023,486 instructions # 0.00 insns per cycle 216 | 1,000,989,951 L1-dcache-loads 217 | 149,260 L1-dcache-load-misses # 0.01% of all L1-dcache hits 218 | 960,840,312 dTLB-load-misses 219 | 220 | 3.917991822 seconds time elapsed 221 | ``` 222 | 223 | Esentially no difference in execution time, a good confirmation that the L1 TLB only caches 4KiB out of a huge page (arg, ARM, whyyyy?!?). 
224 | 225 | And let's also test huge pages on the L2 TLB, which should work: 226 | 227 | ``` 228 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 520 -huge 229 | 230 | Performance counter stats for './tlb_test 520 -huge': 231 | 232 | 7,003,126,596 instructions # 0.00 insns per cycle 233 | 1,000,893,472 L1-dcache-loads 234 | 27,634,720 L1-dcache-load-misses # 2.76% of all L1-dcache hits 235 | 1,000,077,382 dTLB-load-misses 236 | 237 | 3.874247451 seconds time elapsed 238 | 239 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 1024 -huge 240 | 241 | Performance counter stats for './tlb_test 1024 -huge': 242 | 243 | 7,004,239,868 instructions # 0.00 insns per cycle 244 | 1,001,151,686 L1-dcache-loads 245 | 1,000,103,661 L1-dcache-load-misses # 99.90% of all L1-dcache hits 246 | 1,000,126,224 dTLB-load-misses 247 | 248 | 5.313755445 seconds time elapsed 249 | ``` 250 | 251 | Success, huge pages are indeed cached correctly by the L2 TLB. Note that now almost all loads miss in the L1 data cache. We're loading 1024 words, each in a separate (64 byte) cache line. The L1 cache on this core is only 32 KiB in size, therefore we're completely thrashing it at each iteration. When using 4KiB pages this is not an issue because our virtual memory is uninitialized, so all pages point to the same physical zero page and the L1 data cache is [physically indexed, physically tagged](https://en.wikipedia.org/wiki/CPU_cache#Address_translation). 252 | 253 | --------------------------------------------------------------------------------