├── makefile ├── tlb_test.s ├── tlb_test.c └── readme.md /makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-g -O2 -std=gnu99 2 | LDFLAGS=-static 3 | 4 | tlb_test: tlb_test.c tlb_test.s 5 | -------------------------------------------------------------------------------- /tlb_test.s: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015, Cosmin Gorgovan 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
14 | */ 15 | 16 | .global data_test 17 | .func data_test 18 | .type data_test, %function 19 | .arm 20 | @ R0 = buffer, R1 = page_cnt, R2 = total_ops @ dTLB stress loop: performs total_ops loads, one word per page, wrapping back @ to the first page after page_cnt loads. 21 | data_test: 22 | MOV R3, #0 @index 23 | 24 | r:ADD R12, R0, R3, LSL #12 @ R12 = buffer + index*4096 (base of page 'index') 25 | LDR R12, [R12, R3, LSL #4] @ load from buffer + index*(4096+16); the extra index*16 staggers the accesses, presumably to spread them over cache sets 26 | 27 | ADD R3, #1 28 | CMP R3, R1 29 | MOVEQ R3, #0 @ wrap the page index after page_cnt pages 30 | 31 | SUBS R2, #1 @ one load done 32 | BNE r @ repeat until total_ops loads performed 33 | 34 | BX LR 35 | .endfunc 36 | 37 | .global inst_test 38 | .func inst_test 39 | .type inst_test, %function 40 | .arm @ iTLB stress stub, R0 = iteration count. prepare_inst() in tlb_test.c copies @ these 12 bytes to the start of each page, (4096+8) bytes apart, so the BNE @ below (target inst_test+4096+8) lands on the SUBS of the copy on the next @ page; the last copy's branch is patched at runtime to wrap to the first page. 41 | inst_test: 42 | SUBS R0, #1 @ decrement remaining iteration count 43 | BNE inst_test+4096+8 @ hop to the next page's copy of this stub 44 | BX LR @ counter reached zero: return to the C caller 45 | .endfunc 46 | 47 | -------------------------------------------------------------------------------- /tlb_test.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015, Cosmin Gorgovan 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | */ 15 | 16 | #include <stdio.h> 17 | #include <stdlib.h> 18 | #include <string.h> 19 | #include <stdint.h> 20 | #include <assert.h> 21 | #include <sys/mman.h> 22 | 23 | #define PAGE_SIZE 4096 24 | #define OP_CNT (1000*1000*1000) /* loads (d test) or stub iterations (i test) per run */ 25 | #define OVERSIZE 8 /* extra pages mapped: the per-page stagger (16 B/page for data, 8 B/page for inst) spills up to MAX_SIZE*16 B = 8 pages past the end */ 26 | #define MAX_SIZE 2048 27 | #define INST_OFFSET (PAGE_SIZE + 8) /* distance between consecutive copies of the inst_test stub */ 28 | 29 | extern void data_test(void *buf, int page_cnt, int op_cnt); /* tlb_test.s: dTLB load loop */ 30 | extern void *inst_test; /* tlb_test.s: 12-byte iTLB stub copied into each page */ 31 | 32 | typedef void (*itest)(uint32_t count); 33 | 34 | /* Print usage and terminate with EXIT_FAILURE. */ void help() { 35 | printf("\nSyntax: ./tlb_test [d|i] SIZE [-huge]\n" 36 | " d - dTLB test\n" 37 | " i - iTLB test\n" 38 | " SIZE - specified in 4KiB units [1...%d]\n" 39 | " -huge - allocates huge pages\n\n", MAX_SIZE); 40 | exit(EXIT_FAILURE); 41 | } 42 | 43 | /* Copy the inst_test stub to the start of cnt consecutive pages (INST_OFFSET apart) and patch the last copy's branch so it loops back to the first page. */ void prepare_inst(void *buf, int cnt) { 44 | uint32_t *fixup; 45 | void *start_buf = buf; 46 | 47 | for (int i = 0; i < cnt; i++) { 48 | memcpy(buf, &inst_test, 12); 49 | buf += INST_OFFSET; 50 | } 51 | 52 | // Loop back to the first page 53 | fixup = ((uint32_t *)(buf - INST_OFFSET)) + 1; /* second word of the last stub: the BNE */ 54 | *fixup &= 0xFF000000; /* keep condition + opcode, clear the 24-bit offset field */ 55 | *fixup |= ((uint32_t *)start_buf - fixup - 2) & 0xFFFFFF; /* word offset; -2 because an ARM branch's PC reads 8 bytes ahead */ 56 | 57 | __clear_cache(start_buf, fixup + 3); /* sync the instruction cache with the generated code */ 58 | } 59 | 60 | int main(int argc, char **argv) { 61 | int page_cnt; 62 | uint8_t *buf; 63 | int use_huge_pages = 0; 64 | int is_data_test; 65 | itest itlb_test; 66 | 67 | if (argc != 3 && argc != 4) help(); 68 | 69 | if (strcmp(argv[1], "d") == 0) { 70 | is_data_test = 1; 71 | } else if (strcmp(argv[1], "i") == 0) { 72 | is_data_test = 0; 73 | } else { 74 | help(); 75 | }
76 | 77 | page_cnt = atoi(argv[2]); 78 | if (page_cnt < 1 || page_cnt > MAX_SIZE) help(); 79 | 80 | if (argc == 4) { 81 | if (strcmp(argv[3], "-huge") == 0) { 82 | use_huge_pages = 1; 83 | } else { 84 | help(); 85 | } 86 | } 87 | 88 | buf = mmap(NULL, PAGE_SIZE * (page_cnt + OVERSIZE), 89 | PROT_READ | PROT_WRITE | (is_data_test ? 0 : PROT_EXEC), /* the iTLB test executes from buf */ 90 | MAP_PRIVATE|MAP_ANONYMOUS|(use_huge_pages ? MAP_HUGETLB : 0), -1, 0); 91 | assert(buf != MAP_FAILED); 92 | 93 | if (is_data_test) { 94 | data_test(buf, page_cnt, OP_CNT); 95 | } else { 96 | prepare_inst(buf, page_cnt); 97 | itlb_test = (itest)buf; 98 | itlb_test(OP_CNT); /* jump into the first generated stub */ 99 | } 100 | 101 | return 0; 102 | } 103 | 104 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | TLB size and huge pages on ARM cores 2 | ==================================== 3 | 4 | Context 5 | ------- 6 | 7 | In a computer using virtual memory, [page tables](https://en.wikipedia.org/wiki/Page_table) are used to map virtual addresses to physical addresses and to set the R/W/E permissions for each page. Regular page tables on ARMv7 are up to two levels deep and ARMv8 and ARMv7 [LPAE](https://en.wikipedia.org/wiki/Physical_Address_Extension) tables can be up to three levels deep. To avoid walking the page table for each memory access, the pages in use are cached in a [TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer). Cortex-A implementations use a modified Harvard architecture (separate datapaths for instructions and data) with separate small and fast L1 TLBs and a slower and larger unified L2 TLB. 8 | 9 | 10 | Huge pages 11 | ---------- 12 | 13 | The regular page size is 4KiB on most architectures, including ARM. 
To reduce the TLB pressure for applications which work with large datasets or which have large / fragmented code, huge pages (on Linux, 2MiB with ARMv7 LPAE / ARMv8 or 1MiB on ARMv7 without LPAE) can be used. Large page support for LPAE-enabled systems was added in the 3.11 version of the mainline Linux kernel - [patch 1](https://github.com/torvalds/linux/commit/dde1b65110), [patch 2](https://github.com/torvalds/linux/commit/0b19f9335), [patch 3](https://github.com/torvalds/linux/commit/1355e2a6) and [patch 4](https://github.com/torvalds/linux/commit/8d962507) - support for [transparent huge pages](https://lwn.net/Articles/423592/). 14 | 15 | The issue is that many ARMv7 cores from ARM don't properly support caching huge pages in their L1 TLB. To quote the [Cortex-A15 TRM](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0438i/CACCECAH.html): 16 | 17 | > If the page tables map the memory region to a larger granularity than 4K, it only allocates one mapping for the particular 4K region to which the current access corresponds. 18 | 19 | The L2 TLB generally supports huge pages, but it can also cache a high number of entries. Even with 4 KiB pages, I don't expect L2 TLB misses to cause a significant slowdown for most applications. 21 | The table below summarizes the TLB capabilities and sizes for ARM Cortex-A cores: 22 | 23 | | Core | LPAE support | L1 data TLB size | L1 data huge page support | L1 inst. TLB size | L1 inst. huge page support | L2 TLB size | L2 TLB huge page support | 24 | |------------|--------------|--------------------------|---------------------------|-------------------|----------------------------|-------------|--------------------------| 25 | | Cortex-A5 | N | 10 | ? | 10 | ? | 128 | Y | 26 | | Cortex-A7 | Y | 10 | ? | 10 | ? | 256 | Y | 27 | | Cortex-A8 | N | 32 | Y | 32 | Y | N/A | N/A | 28 | | Cortex-A9 | N | 32 | ? | 32 or 64 | ? 
| 4 + (64, 128, 256 or 512) | Y | 29 | | Cortex-A15 | Y | 32 (reads) + 32 (writes) | Optional 1M | 32 | N | 512 | 64K, 1M, 2M, 16M, 1G | 30 | | Cortex-A17 | Y | 32 | 1M | 32, 48 or 64 | 1M | 1024 | 64K, 1M, 2M, 16M, 1G | 31 | | Cortex-A53 | N/A (64-bit) | 10 | ? | 10 | ? | 512 | 64K, 1M, 2M, 16M, 512M (?) | 32 | | Cortex-A57 | N/A (64-bit) | 32 | 64K, 1M | 48 | 64K, 1M | 1024 | 64K, 1M, 2M, 16M, 512M, 1G | 33 | | Cortex-A72 | N/A (64-bit) | 32 | 64K, 1M | 48 | 64K, 1M | 1024 | 64K, 1M, 2M, 16M, 512M, 1G | 34 | | Cortex-A73 | N/A (64-bit) | 48 | 16K, 64K, 1M | 32 | 16K, 64K, 1M | 1024 + 128 | 1024 * (16K, 64K) + 128 * (1M, 2M, 16M, 32M, 512M, 1G) 35 | 36 | 37 | 38 | In the TRMs for A5, A7, A9 and A53 it's not clear what sizes are supported by the L1 micro TLBs. 39 | 40 | Cortex-A8 doesn't seem to have a unified TLB. 41 | 42 | None of the LPAE-enabled cores seem to support 2MB pages in the L1 TLBs. 43 | 44 | 45 | Runtime detection 46 | ----------------- 47 | 48 | Given the large number of cores with vague specifications or vendor-configurable options, I thought it would be interesting to develop a tool and a technique to determine the configuration by observing runtime behaviour, without any access to specs. 49 | 50 | I'm introducing the unimaginatively named tlb test utility, which runs on ARM GNU/Linux systems. 51 | 52 | 53 | Configurations determined using tlb test 54 | ---------------------------------------- 55 | 56 | | System | LPAE support | L1 data TLB size | L1 data huge page support | L1 inst. TLB size | L1 inst. huge page support | L2 TLB size | L2 TLB huge page support | 57 | |------------|--------------|------------------|---------------------------|-------------------|----------------------------|-------------|--------------------------| 58 | | Odroid-X2 (Exynos 4412 Prime, Cortex-A9) | N | 32 | ? | 32 | ? | 4 + 128 | Y | 59 | | Xilinx Zynq Z-7045 (Cortex-A9) | N | 32 | ? | 32 | ? 
| 4 + 128 | Y | 60 | ARM Juno LITTLE core (Cortex-A53) | N/A (64-bit) | 10 | N | 10 | Y | 512 | Y | 61 | Rockchip RK3288 (Cortex-A17) | Y | 32 | Y | 32 | Y | 1024 | Y | 62 | Tegra K1 T124 (Cortex-A15) | Y | 32(+32?) | N | 32 | N | 512 | Y | 63 | Tegra K1 T132 (NVIDIA Denver) | N/A (64-bit) | 256(?) | ? | 128(?) | ? | ? | ? | 64 | APM883208 (APM X-Gene) | N/A (64-bit) | 20 | Y | 10 | N | 1024 | Y | 65 | Tegra X2 T186 (NVIDIA Denver2) | N/A (64-bit) | 256(?) | ? | 128(?) | ? | ? | ? | 66 | 67 | Theory of operation 68 | ------------------- 69 | 70 | The basic idea is to load data (it's only testing the data TLB) from a configurable number of different pages, in quick succession, while minimising the effect of other sources of timing noise. The function *data_test()* in *tlb_test.s* reads *total_ops* words from *buffer*, each read incrementing the pointer by (4096 + 16) bytes, with wrap-around every *page_cnt* reads. 71 | 72 | When actively reading from more pages than the TLB size, performance will suddenly decrease. By using a buffer larger than (L1-data-TLB-size * regular-page-size) and smaller than (L1-data-TLB-size * huge-page-size) allocated using huge pages, we can determine if the L1 TLB can cache huge pages. 73 | 74 | 75 | Example 76 | ------- 77 | 78 | As an example and sanity check, I've run *tlb_test* on a Tegra K1 (Cortex-A15) SoC. I'm using [perf](https://perf.wiki.kernel.org/index.php/Main_Page) to confirm the causes of overhead. It's not strictly required and just execution time is good enough to use tlb_test. Note that the current tool requires a mode selector (*d* or *i*) as its first argument, e.g. *./tlb_test d 16*; the transcripts below omit it. 
First, let's confirm the size of the L1 data TLB: 79 | 80 | ``` 81 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 16 82 | 83 | Performance counter stats for './tlb_test 16': 84 | 85 | 7,001,546,456 instructions # 0.00 insns per cycle 86 | 1,000,544,697 L1-dcache-loads 87 | 45,560 L1-dcache-load-misses # 0.00% of all L1-dcache hits 88 | 50,669 dTLB-load-misses 89 | 90 | 2.368960939 seconds time elapse 91 | 92 | 93 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 32 94 | 95 | Performance counter stats for './tlb_test 32': 96 | 97 | 7,001,545,342 instructions # 0.00 insns per cycle 98 | 1,000,545,046 L1-dcache-loads 99 | 45,619 L1-dcache-load-misses # 0.00% of all L1-dcache hits 100 | 70,292 dTLB-load-misses 101 | 102 | 2.362317209 seconds time elapsed 103 | 104 | 105 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 33 106 | 107 | Performance counter stats for './tlb_test 33': 108 | 109 | 7,001,525,903 instructions # 0.00 insns per cycle 110 | 1,000,538,344 L1-dcache-loads 111 | 44,769 L1-dcache-load-misses # 0.00% of all L1-dcache hits 112 | 48,294,461 dTLB-load-misses 113 | 114 | 2.416565829 seconds time elapsed 115 | 116 | 117 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 40 118 | 119 | Performance counter stats for './tlb_test 40': 120 | 121 | 7,002,175,820 instructions # 0.00 insns per cycle 122 | 1,000,795,312 L1-dcache-loads 123 | 69,207 L1-dcache-load-misses # 0.01% of all L1-dcache hits 124 | 956,369,999 dTLB-load-misses 125 | 126 | 3.886506385 seconds time elapsed 127 | ``` 128 | 129 | 130 | Starting with 33 pages, the number of dTLB misses increases dramatically. Even without perf, we could easily deduce the TLB size is 32 using the timing information. 
131 | 132 | Next, let's confirm the size of the L2 TLB: 133 | 134 | ``` 135 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 48 136 | 137 | Performance counter stats for './tlb_test 48': 138 | 139 | 7,002,547,418 instructions # 0.00 insns per cycle 140 | 1,000,921,383 L1-dcache-loads 141 | 86,844 L1-dcache-load-misses # 0.01% of all L1-dcache hits 142 | 1,000,052,404 dTLB-load-misses 143 | 144 | 4.002053526 seconds time elapsed 145 | 146 | 147 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 256 148 | 149 | Performance counter stats for './tlb_test 256': 150 | 151 | 7,003,012,833 instructions # 0.00 insns per cycle 152 | 1,001,087,251 L1-dcache-loads 153 | 120,170 L1-dcache-load-misses # 0.01% of all L1-dcache hits 154 | 1,000,078,542 dTLB-load-misses 155 | 156 | 3.995740584 seconds time elapsed 157 | 158 | 159 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 512 160 | 161 | Performance counter stats for './tlb_test 512': 162 | 163 | 7,002,650,367 instructions # 0.00 insns per cycle 164 | 1,000,948,480 L1-dcache-loads 165 | 140,842 L1-dcache-load-misses # 0.01% of all L1-dcache hits 166 | 1,000,061,642 dTLB-load-misses 167 | 168 | 4.002590827 seconds time elapsed 169 | 170 | 171 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 513 172 | 173 | Performance counter stats for './tlb_test 513': 174 | 175 | 7,002,650,847 instructions # 0.00 insns per cycle 176 | 1,000,938,797 L1-dcache-loads 177 | 127,003 L1-dcache-load-misses # 0.01% of all L1-dcache hits 178 | 1,000,060,931 dTLB-load-misses 179 | 180 | 4.037016451 seconds time elapsed 181 | 182 | 183 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 520 184 | 185 | Performance counter stats for './tlb_test 520': 186 | 187 | 7,003,596,287 instructions # 0.00 insns per cycle 188 | 
1,001,290,282 L1-dcache-loads 189 | 179,866 L1-dcache-load-misses # 0.02% of all L1-dcache hits 190 | 1,000,092,440 dTLB-load-misses 191 | 192 | 4.549982772 seconds time elapsed 193 | 194 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 1024 195 | 196 | Performance counter stats for './tlb_test 1024': 197 | 198 | 7,008,804,035 instructions # 0.00 insns per cycle 199 | 1,003,187,234 L1-dcache-loads 200 | 465,421 L1-dcache-load-misses # 0.05% of all L1-dcache hits 201 | 1,000,212,790 dTLB-load-misses 202 | 203 | 16.194994147 seconds time elapsed 204 | ``` 205 | 206 | Note how the performance with 48, 256 and 512 pages is practically identical, while with 520 it is significantly slower. When the data size is double the L2 TLB size (1024 pages), execution slows down by a factor of 4x. 207 | 208 | Now let's see if using huge pages can help. First, when the data size would fit in the L2 TLB using normal pages, but not in the L1 TLB: 209 | 210 | ``` 211 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 40 -huge 212 | 213 | Performance counter stats for './tlb_test 40 -huge': 214 | 215 | 7,003,023,486 instructions # 0.00 insns per cycle 216 | 1,000,989,951 L1-dcache-loads 217 | 149,260 L1-dcache-load-misses # 0.01% of all L1-dcache hits 218 | 960,840,312 dTLB-load-misses 219 | 220 | 3.917991822 seconds time elapsed 221 | ``` 222 | 223 | Esentially no difference in execution time, a good confirmation that the L1 TLB only caches 4KiB out of a huge page (arg, ARM, whyyyy?!?). 
224 | 225 | And let's also test huge pages on the L2 TLB, which should work: 226 | 227 | ``` 228 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 520 -huge 229 | 230 | Performance counter stats for './tlb_test 520 -huge': 231 | 232 | 7,003,126,596 instructions # 0.00 insns per cycle 233 | 1,000,893,472 L1-dcache-loads 234 | 27,634,720 L1-dcache-load-misses # 2.76% of all L1-dcache hits 235 | 1,000,077,382 dTLB-load-misses 236 | 237 | 3.874247451 seconds time elapsed 238 | 239 | $ perf stat -e instructions,L1-dcache-loads,L1-dcache-load-misses,dTLB-load-misses ./tlb_test 1024 -huge 240 | 241 | Performance counter stats for './tlb_test 1024 -huge': 242 | 243 | 7,004,239,868 instructions # 0.00 insns per cycle 244 | 1,001,151,686 L1-dcache-loads 245 | 1,000,103,661 L1-dcache-load-misses # 99.90% of all L1-dcache hits 246 | 1,000,126,224 dTLB-load-misses 247 | 248 | 5.313755445 seconds time elapsed 249 | ``` 250 | 251 | Success, huge pages are indeed cached correctly by the L2 TLB. Note that now almost all loads miss in the L1 data cache. We're loading 1024 words, each in a separate (64 byte) cache line. The L1 cache on this core is only 32 KiB in size, therefore we're completely thrashing it at each iteration. When using 4KiB pages this is not an issue because our virtual memory is uninitialized, so all pages point to the same physical zero page and the L1 data cache is [physically indexed, physically tagged](https://en.wikipedia.org/wiki/CPU_cache#Address_translation). 252 | 253 | --------------------------------------------------------------------------------