├── .github └── workflows │ └── publish.yml ├── .gitignore ├── README.md ├── data ├── firestorm │ ├── btb.csv │ ├── cache.txt │ ├── lsu.csv │ └── ras.csv ├── geekerwan │ ├── a14_a16_BV1gm4y157No.png │ ├── a16_a17_BV1gm4y157No.png │ ├── a17_BV1gm4y157No.png │ ├── a17_ecore_BV1gm4y157No.png │ └── m2_m3_BV1NJ4m1w7zk.png ├── golden_cove │ ├── bob.csv │ ├── ras.csv │ ├── rf.csv │ ├── rob.csv │ └── sched.csv └── zen2 │ └── lsu.csv ├── docs ├── 3a6000.d2 ├── 3a6000.md ├── Makefile ├── ampere_one.d2 ├── ampere_one.md ├── avalanche.d2 ├── avalanche.md ├── cbp.md ├── comparison.md ├── cortex_a75.d2 ├── cortex_a75.md ├── cortex_a77.d2 ├── cortex_a77.md ├── cortex_x1.d2 ├── cortex_x1.md ├── cortex_x2.d2 ├── cortex_x2.md ├── cortex_x3.d2 ├── cortex_x3.md ├── cortex_x4.d2 ├── cortex_x4.md ├── cortex_x925.d2 ├── cortex_x925.md ├── crestmont.d2 ├── crestmont.md ├── dieshot.md ├── firestorm.d2 ├── firestorm.md ├── golden_cove.d2 ├── golden_cove.md ├── gracemont.d2 ├── gracemont.md ├── index.md ├── lion_cove.d2 ├── lion_cove.md ├── m3_pcore.d2 ├── m3_pcore.md ├── m4_pcore.d2 ├── m4_pcore.md ├── main.py ├── neoverse_n2.md ├── neoverse_v2.d2 ├── neoverse_v2.md ├── oryon.d2 ├── oryon.md ├── p550.d2 ├── p550.md ├── p870.d2 ├── p870.md ├── redwood_cove.d2 ├── redwood_cove.md ├── skylake.d2 ├── skylake.md ├── skymont.d2 ├── skymont.md ├── sunny_cove.d2 ├── sunny_cove.md ├── uarch.csv ├── xiaomi.d2 ├── xiaomi.md ├── zen1.d2 ├── zen1.md ├── zen2.d2 ├── zen2.md ├── zen3.d2 ├── zen3.md ├── zen4.d2 ├── zen4.md ├── zen5.d2 └── zen5.md ├── main.py ├── mkdocs.yml ├── poetry.lock └── pyproject.toml /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | on: 3 | push: 4 | branches: 5 | - master 6 | permissions: 7 | contents: write 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | fetch-depth: 0 # for git history 15 | - uses: actions/cache@v4 16 | with: 17 | 
key: ${{ github.ref }} 18 | path: .cache 19 | - uses: Gr1N/setup-poetry@v9 20 | - run: curl -fsSL https://d2lang.com/install.sh | sh -s -- 21 | - run: poetry install 22 | - run: cd docs && make 23 | - run: poetry run mkdocs gh-deploy --force 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | site/ 3 | *.svg 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CPU Microarchitecture Diagrams 2 | 3 | See to find microarchitecture diagrams of several CPUs. 4 | -------------------------------------------------------------------------------- /data/firestorm/btb.csv: -------------------------------------------------------------------------------- 1 | size,stride,min,avg,max 2 | 1,4,2.55,2.59,2.62 3 | 2,4,1.80,1.81,1.81 4 | 3,4,1.53,1.54,1.56 5 | 4,4,1.40,1.40,1.41 6 | 5,4,1.32,1.32,1.33 7 | 6,4,1.27,1.27,1.27 8 | 7,4,1.23,1.23,1.23 9 | 8,4,1.20,1.20,1.21 10 | 9,4,1.18,1.18,1.18 11 | 10,4,1.16,1.16,1.17 12 | 11,4,1.15,1.15,1.15 13 | 12,4,1.13,1.13,1.14 14 | 13,4,1.12,1.12,1.12 15 | 14,4,1.11,1.12,1.12 16 | 15,4,1.11,1.11,1.11 17 | 16,4,1.10,1.10,1.10 18 | 17,4,1.10,1.10,1.10 19 | 19,4,1.09,1.27,1.63 20 | 20,4,1.08,1.08,1.08 21 | 21,4,1.08,1.08,1.08 22 | 24,4,1.07,1.07,1.07 23 | 25,4,1.06,1.08,1.09 24 | 26,4,1.06,1.06,1.06 25 | 30,4,1.05,1.05,1.06 26 | 31,4,1.05,1.05,1.05 27 | 32,4,1.05,1.05,1.05 28 | 33,4,1.05,1.05,1.05 29 | 39,4,1.04,1.04,1.04 30 | 40,4,1.04,1.04,1.04 31 | 41,4,1.04,1.04,1.04 32 | 49,4,1.03,1.03,1.03 33 | 50,4,1.03,1.03,1.03 34 | 51,4,1.03,1.03,1.03 35 | 62,4,1.03,1.03,1.03 36 | 63,4,1.03,1.31,1.50 37 | 64,4,1.02,1.03,1.03 38 | 65,4,1.02,1.02,1.02 39 | 80,4,1.02,1.02,1.02 40 | 81,4,1.02,1.02,1.02 41 | 82,4,1.02,1.02,1.02 42 | 101,4,1.02,1.02,1.02 43 | 
102,4,1.02,1.02,1.02 44 | 103,4,1.02,1.02,1.02 45 | 127,4,1.01,1.01,1.01 46 | 128,4,1.01,1.01,1.02 47 | 129,4,1.01,1.01,1.01 48 | 160,4,1.01,1.01,1.01 49 | 161,4,1.01,1.02,1.05 50 | 162,4,1.01,1.01,1.02 51 | 202,4,1.01,1.01,1.01 52 | 203,4,1.01,1.01,1.01 53 | 204,4,1.01,1.01,1.01 54 | 255,4,1.01,1.01,1.01 55 | 256,4,1.01,1.02,1.05 56 | 257,4,1.01,1.01,1.01 57 | 322,4,1.01,1.01,1.01 58 | 323,4,1.01,1.01,1.01 59 | 324,4,1.01,1.01,1.04 60 | 406,4,1.00,1.00,1.00 61 | 407,4,1.00,1.00,1.00 62 | 408,4,1.00,1.00,1.00 63 | 511,4,1.00,1.01,1.03 64 | 512,4,1.00,1.00,1.00 65 | 513,4,1.00,1.00,1.00 66 | 644,4,1.00,1.01,1.05 67 | 645,4,1.00,1.00,1.00 68 | 646,4,1.00,1.00,1.00 69 | 812,4,1.00,1.00,1.00 70 | 813,4,1.00,1.00,1.01 71 | 814,4,1.00,1.00,1.00 72 | 1023,4,1.00,1.00,1.00 73 | 1024,4,1.00,1.00,1.00 74 | 1025,4,1.00,1.00,1.00 75 | 1289,4,1.84,1.84,1.85 76 | 1290,4,1.85,1.85,1.85 77 | 1291,4,1.85,1.86,1.87 78 | 1624,4,2.91,2.91,2.91 79 | 1625,4,2.91,2.91,2.92 80 | 1626,4,2.92,2.92,2.93 81 | 2046,4,2.88,2.88,2.89 82 | 2047,4,2.88,2.88,2.89 83 | 2048,4,2.88,2.89,2.89 84 | 2049,4,2.88,2.89,2.92 85 | 2579,4,2.88,2.89,2.89 86 | 2580,4,2.89,2.89,2.89 87 | 2581,4,2.89,2.89,2.91 88 | 3250,4,2.88,2.89,2.89 89 | 3251,4,2.88,2.88,2.89 90 | 3252,4,2.88,2.88,2.89 91 | 4095,4,2.89,2.89,2.89 92 | 4096,4,2.88,2.88,2.89 93 | 4097,4,2.88,2.89,2.89 94 | 5160,4,3.00,3.00,3.00 95 | 5161,4,3.00,3.00,3.00 96 | 5162,4,3.00,3.01,3.01 97 | 6501,4,2.96,2.97,2.97 98 | 6502,4,2.96,2.96,2.97 99 | 6503,4,2.96,2.97,2.97 100 | 8191,4,3.00,3.00,3.01 101 | 8192,4,3.00,3.01,3.02 102 | 8193,4,3.00,3.00,3.01 103 | 10320,4,3.00,3.01,3.02 104 | 10321,4,3.00,3.00,3.01 105 | 10322,4,3.00,3.01,3.02 106 | 13003,4,3.00,3.00,3.00 107 | 13004,4,3.00,3.00,3.01 108 | 13005,4,3.00,3.00,3.01 109 | 16383,4,3.00,3.01,3.04 110 | 16384,4,3.00,3.01,3.03 111 | 16385,4,3.01,3.01,3.01 112 | 20642,4,3.01,3.02,3.04 113 | 20643,4,3.01,3.03,3.06 114 | 20644,4,3.02,3.03,3.05 115 | 26008,4,3.01,3.02,3.03 116 | 26009,4,3.02,3.03,3.04 117 
| 26010,4,3.00,3.01,3.03 118 | 32767,4,3.01,3.03,3.05 119 | 32768,4,3.01,3.03,3.06 120 | 32769,4,3.01,3.04,3.05 121 | 41284,4,3.02,3.03,3.06 122 | 41285,4,3.03,3.05,3.07 123 | 41286,4,3.01,3.01,3.02 124 | 52015,4,3.15,3.16,3.17 125 | 52016,4,3.16,3.17,3.20 126 | 52017,4,3.15,3.17,3.18 127 | 65535,4,3.37,3.37,3.38 128 | 65536,4,3.37,3.37,3.37 129 | 65537,4,3.36,3.37,3.38 130 | 82569,4,3.37,3.37,3.38 131 | 82570,4,3.36,3.36,3.36 132 | 82571,4,3.36,3.38,3.39 133 | 104031,4,3.36,3.37,3.40 134 | 104032,4,3.37,3.37,3.38 135 | 104033,4,3.36,3.37,3.37 136 | -------------------------------------------------------------------------------- /data/firestorm/cache.txt: -------------------------------------------------------------------------------- 1 | hw.perflevel0.l1icachesize: 196608 2 | hw.perflevel0.l1dcachesize: 131072 3 | hw.perflevel0.l2cachesize: 12582912 4 | hw.perflevel1.l1icachesize: 131072 5 | hw.perflevel1.l1dcachesize: 65536 6 | hw.perflevel1.l2cachesize: 4194304 7 | hw.cacheconfig: 8 1 4 0 0 0 0 0 0 0 8 | hw.cachesize: 3708731392 65536 4194304 0 0 0 0 0 0 0 9 | hw.cachelinesize: 128 10 | hw.l1icachesize: 131072 11 | hw.l1dcachesize: 65536 12 | hw.l2cachesize: 4194304 13 | -------------------------------------------------------------------------------- /data/firestorm/ras.csv: -------------------------------------------------------------------------------- 1 | size,min,avg,max 2 | 1,1.00,1.00,1.00 3 | 2,0.50,0.84,1.00 4 | 3,0.67,0.78,1.00 5 | 4,0.75,0.75,0.75 6 | 5,0.60,0.73,0.80 7 | 6,0.67,0.73,0.83 8 | 7,0.71,0.72,0.86 9 | 8,0.62,0.96,4.38 10 | 9,0.67,0.74,3.44 11 | 10,0.70,0.74,1.90 12 | 11,0.64,0.70,0.73 13 | 12,0.67,0.70,0.75 14 | 13,0.69,0.70,0.77 15 | 14,0.64,0.86,15.36 16 | 15,0.67,0.70,0.73 17 | 16,0.69,0.70,0.75 18 | 17,0.65,0.70,0.76 19 | 18,0.67,0.69,0.72 20 | 19,0.68,0.70,1.26 21 | 20,0.70,1.06,1.85 22 | 21,0.67,0.85,2.76 23 | 22,0.68,0.73,2.09 24 | 23,0.65,0.71,1.22 25 | 24,0.67,0.71,1.08 26 | 25,0.68,1.14,4.76 27 | 26,0.69,1.44,14.19 28 | 
27,0.67,0.98,3.96 29 | 28,0.68,1.06,5.14 30 | 29,0.66,0.74,2.21 31 | 30,0.67,1.05,2.20 32 | 31,0.68,1.30,6.94 33 | 32,1.06,1.13,4.84 34 | 33,0.67,6.56,34.64 35 | 34,0.71,2.37,8.91 36 | 35,0.74,5.89,12.63 37 | 36,0.78,3.04,16.00 38 | 37,0.81,6.41,8.59 39 | 38,0.84,3.00,10.87 40 | 39,0.87,0.89,1.15 41 | 40,0.90,2.23,8.12 42 | 41,0.93,0.97,1.59 43 | 42,0.95,7.05,95.43 44 | 43,0.98,1.03,1.79 45 | 44,1.00,4.55,48.61 46 | 45,1.02,10.14,122.67 47 | 46,1.04,1.06,1.07 48 | 47,1.06,1.08,1.09 49 | 48,1.08,4.71,34.50 50 | 49,1.10,5.24,76.29 51 | 50,1.12,5.01,50.88 52 | 51,5.57,9.63,65.25 53 | 52,5.48,10.23,95.85 54 | 53,5.40,6.67,36.87 55 | 54,5.31,6.87,55.11 56 | 55,5.24,8.34,27.25 57 | 56,5.16,7.96,38.00 58 | 57,5.11,26.98,348.40 59 | 58,5.03,7.27,56.45 60 | 59,4.97,6.83,53.93 61 | 60,4.90,6.46,45.75 62 | 61,4.85,5.53,9.25 63 | 62,4.77,10.00,47.16 64 | 63,4.71,17.70,202.35 65 | 64,4.66,8.36,51.64 66 | -------------------------------------------------------------------------------- /data/geekerwan/a14_a16_BV1gm4y157No.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a14_a16_BV1gm4y157No.png -------------------------------------------------------------------------------- /data/geekerwan/a16_a17_BV1gm4y157No.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a16_a17_BV1gm4y157No.png -------------------------------------------------------------------------------- /data/geekerwan/a17_BV1gm4y157No.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a17_BV1gm4y157No.png -------------------------------------------------------------------------------- 
/data/geekerwan/a17_ecore_BV1gm4y157No.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a17_ecore_BV1gm4y157No.png -------------------------------------------------------------------------------- /data/geekerwan/m2_m3_BV1NJ4m1w7zk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/m2_m3_BV1NJ4m1w7zk.png -------------------------------------------------------------------------------- /data/golden_cove/ras.csv: -------------------------------------------------------------------------------- 1 | size,min,avg,max 2 | 1,1.16,1.16,1.19 3 | 2,1.07,1.07,1.09 4 | 3,1.06,1.07,1.12 5 | 4,1.04,1.04,1.09 6 | 5,1.03,1.03,1.04 7 | 6,1.19,1.19,1.20 8 | 7,1.16,1.16,1.17 9 | 8,1.14,1.14,1.16 10 | 9,1.13,1.13,1.13 11 | 10,1.11,1.11,1.12 12 | 11,1.10,1.10,1.11 13 | 12,1.09,1.10,1.10 14 | 13,1.09,1.10,1.88 15 | 14,1.15,1.17,1.59 16 | 15,1.14,1.15,1.56 17 | 16,1.13,1.13,1.15 18 | 17,1.13,1.13,1.49 19 | 18,1.12,1.13,1.49 20 | 19,1.11,1.11,1.11 21 | 20,1.11,1.11,1.11 22 | 21,2.20,2.20,2.20 23 | 22,2.05,2.20,2.30 24 | 23,2.05,2.30,3.21 25 | 24,2.31,2.31,2.31 26 | 25,2.28,2.40,2.45 27 | 26,2.35,2.40,2.50 28 | 27,2.41,2.43,2.64 29 | 28,2.47,2.48,2.61 30 | 29,2.52,2.56,2.77 31 | 30,2.57,2.57,2.66 32 | 31,2.62,2.63,2.88 33 | 32,2.66,2.72,3.69 34 | 33,2.86,2.86,2.86 35 | 34,2.76,2.88,2.95 36 | 35,2.89,2.92,2.95 37 | 36,2.81,2.85,3.06 38 | 37,2.84,2.84,2.91 39 | 38,2.87,2.88,3.36 40 | 39,3.08,3.18,3.18 41 | 40,2.93,3.03,3.35 42 | 41,2.97,2.99,3.15 43 | 42,2.98,3.02,3.07 44 | 43,3.01,3.11,3.46 45 | 44,3.03,3.03,3.08 46 | 45,3.07,3.07,3.07 47 | 46,3.07,3.13,3.46 48 | 47,3.09,3.15,3.32 49 | 48,3.31,3.32,3.32 50 | 49,3.14,3.20,3.67 51 | 50,3.14,3.16,3.34 52 | 51,3.16,3.31,3.38 53 | 52,3.18,3.20,3.37 54 | 53,3.23,3.23,3.26 55 | 
54,3.22,3.23,3.34 56 | 55,3.23,3.26,3.59 57 | 56,3.23,3.26,3.56 58 | 57,3.25,3.33,3.44 59 | 58,3.26,3.26,3.33 60 | 59,3.28,3.30,3.54 61 | 60,3.29,3.29,3.35 62 | 61,3.31,3.39,3.46 63 | 62,3.31,3.31,3.37 64 | 63,3.33,3.46,5.05 65 | 64,3.33,3.35,3.55 66 | -------------------------------------------------------------------------------- /docs/3a6000.d2: -------------------------------------------------------------------------------- 1 | cpu : Loongson 3A6000 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | l1btb: 64-entry L1 BTB, 1 cycle latency 6 | 7 | # Source: Chips and Cheese 8 | ras: 16-entry RAS 9 | } 10 | 11 | l1ic: L1 IC { 12 | # Source: Chips and Cheese 13 | l1ic: 64KB 4-way L1 IC 14 | } 15 | 16 | bp -> l1ic 17 | 18 | iq: Instruction Queue 19 | l1ic -> iq 20 | 21 | # Source: Chips and Cheese 22 | decode: 6-way Decode 23 | iq -> decode 24 | 25 | # Source: Chips and Cheese 26 | rename: 6-way Rename 27 | decode -> rename 28 | } 29 | 30 | backend: Backend { 31 | # Source: Chips and Cheese 32 | rob: 256-entry ROB 33 | 34 | rf: Register File { 35 | # Source: Chips and Cheese 36 | irf: 192-entry Integer Register File 37 | 38 | # Source: Chips and Cheese 39 | vrf: 192-entry Vector Register File 40 | } 41 | 42 | # Source: Chips and Cheese 43 | sched1: 48-entry Integer Scheduler \#1 44 | 45 | # Source: Chips and Cheese 46 | pipe1: Pipe \#1 { 47 | ALU 48 | INT MUL 49 | Branch 50 | } 51 | rob -> sched1 -> rf -> pipe1 52 | 53 | # Source: Chips and Cheese 54 | pipe2: Pipe \#2 { 55 | ALU 56 | INT MUL 57 | Branch 58 | } 59 | rob -> sched1 -> rf -> pipe2 60 | 61 | # Source: Chips and Cheese 62 | pipe3: Pipe \#3 { 63 | ALU 64 | } 65 | rob -> sched1 -> rf -> pipe3 66 | 67 | # Source: Chips and Cheese 68 | pipe4: Pipe \#4 { 69 | ALU 70 | } 71 | rob -> sched1 -> rf -> pipe4 72 | 73 | # Source: Chips and Cheese 74 | sched2: 48-entry Memory Scheduler \#2 75 | 76 | # Source: Chips and Cheese 77 | pipe5: Pipe \#5 { 78 | Load AGU 79 | } 80 | 
rob -> sched2 -> rf -> pipe5 81 | 82 | # Source: Chips and Cheese 83 | pipe6: Pipe \#6 { 84 | Load AGU 85 | } 86 | rob -> sched2 -> rf -> pipe6 87 | 88 | # Source: Chips and Cheese 89 | pipe7: Pipe \#7 { 90 | Store AGU 91 | } 92 | rob -> sched2 -> rf -> pipe7 93 | 94 | # Source: Chips and Cheese 95 | pipe8: Pipe \#8 { 96 | Store AGU 97 | } 98 | rob -> sched2 -> rf -> pipe8 99 | 100 | lsu: LSU { 101 | # Source: Chips and Cheese 102 | 75-entry Load Queue 103 | 64-entry Store Queue 104 | } 105 | 106 | pipe5 -> lsu 107 | pipe6 -> lsu 108 | pipe7 -> lsu 109 | pipe8 -> lsu 110 | 111 | # Source: Chips and Cheese 112 | sched3: 48-entry FP/Vector Scheduler \#3 113 | 114 | # Source: Chips and Cheese 115 | pipe9: Pipe \#9 { 116 | Vec ALU 117 | Vec FADD 118 | FMUL 119 | } 120 | rob -> sched3 -> rf -> pipe9 121 | 122 | # Source: Chips and Cheese 123 | pipe10: Pipe \#10 { 124 | Vec ALU 125 | Vec FADD 126 | FMUL 127 | } 128 | rob -> sched3 -> rf -> pipe10 129 | 130 | # Source: Chips and Cheese 131 | pipe11: Pipe \#11 { 132 | Vec ALU 133 | Vec FADD 134 | Vec FMUL 135 | FADD 136 | } 137 | rob -> sched3 -> rf -> pipe11 138 | 139 | # Source: Chips and Cheese 140 | pipe12: Pipe \#12 { 141 | Vec ALU 142 | Vec FADD 143 | Vec FMUL 144 | FADD 145 | } 146 | rob -> sched3 -> rf -> pipe12 147 | } 148 | frontend.rename -> backend.rob 149 | 150 | mem: Memory { 151 | l1: L1 DC { 152 | # Source: Chips and Cheese 153 | l1dtlb: 64-entry L1 DTLB 154 | 155 | # Source: Chips and Cheese 156 | l1dc: 64KB 4-way L1DC 157 | } 158 | 159 | l2: L2 { 160 | # Source: Chips and Cheese 161 | l2dc: 256KB 4-way L2 Cache 162 | } 163 | l1 -> l2 164 | } 165 | frontend.l1ic -> mem.l2 166 | backend.lsu -> mem.l1 167 | 168 | info: |md 169 | Drawn by Jiajie Chen @jiegec 170 | 171 | Based on data from Chips and Cheese, Loongson 172 | | 173 | } -------------------------------------------------------------------------------- /docs/3a6000.md: -------------------------------------------------------------------------------- 1 
| # Loongson 3A6000 2 | 3 | ![](./3a6000.svg) 4 | 5 | References: 6 | 7 | - [Loongson 3A6000: A Star among Chinese CPUs](https://chipsandcheese.com/2024/03/13/loongson-3a6000-a-star-among-chinese-cpus/) 8 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | SRCS=$(wildcard *.d2) 2 | DSTS=$(patsubst %.d2,%.svg,$(SRCS)) 3 | 4 | all: $(DSTS) 5 | 6 | %.svg: %.d2 7 | d2 $^ $@ --layout=elk --sketch=true -------------------------------------------------------------------------------- /docs/ampere_one.d2: -------------------------------------------------------------------------------- 1 | cpu : Ampere One CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Ampere 5 | l1btb: 256-entry L1 BTB, zero bubble 6 | 7 | # Source: Ampere 8 | l2btb: 8192-entry L2 BTB, two bubbles 9 | 10 | # Source: Ampere 11 | indir: indirect branch predictor 12 | 13 | # Source: Ampere 14 | tage: 8-table TAGE direction predictor 15 | 16 | # Source: Ampere 17 | latency: 10-cycle branch mispredict recovery 18 | } 19 | 20 | l1ic: L1 IC { 21 | # Source: Ampere 22 | l1itlb: 64-entry 4-way L1 ITLB 23 | 24 | # Source: Ampere 25 | l1ic: 16KB 4-way L1 IC 26 | } 27 | 28 | # Source: Ampere 29 | fq: 32-entry Fetch Queue 30 | bp -> fq 31 | fq -> l1ic 32 | 33 | # Source: Ampere 34 | iq: Instruction Queue 35 | l1ic -> iq: 5 inst/cycle 36 | 37 | # Source: Ampere 38 | decode: 5-way Decode 39 | iq -> decode 40 | 41 | # Source: Ampere 42 | rename: 4-way Rename { 43 | Macro Fusion 44 | } 45 | decode -> rename: 4 MOP/cycle 46 | } 47 | 48 | backend: Backend { 49 | # Source: Ampere 50 | rob: 208-entry ROB 51 | 52 | rf: Register File { 53 | # Source: Ampere 54 | irf: 166-entry Integer Register File 55 | 56 | # Source: Ampere 57 | vrf: 128-entry FP/Vector Register File 58 | } 59 | 60 | # Source: Ampere 61 | sched1: Integer Scheduler A0 62 | 63 | # Source: Ampere 64 | pipe1: 
Pipe \#1 { 65 | ALU 66 | Branch 67 | Flag 68 | } 69 | rob -> sched1 -> rf -> pipe1 70 | 71 | # Source: Ampere 72 | sched2: Integer Scheduler B0 73 | 74 | # Source: Ampere 75 | pipe2: Pipe \#2 { 76 | ALU 77 | Complex shift 78 | } 79 | rob -> sched2 -> rf -> pipe2 80 | 81 | # Source: Ampere 82 | sched3: Integer Scheduler B1 83 | 84 | # Source: Ampere 85 | pipe3: Pipe \#3 { 86 | Multicycle 87 | } 88 | rob -> sched2 -> rf -> pipe3 89 | rob -> sched3 -> rf -> pipe3 90 | 91 | # Source: Ampere 92 | pipe4: Pipe \#4 { 93 | ALU 94 | Complex shift 95 | } 96 | rob -> sched3 -> rf -> pipe4 97 | 98 | # Source: Ampere 99 | sched4: Integer Scheduler A1 100 | 101 | # Source: Ampere 102 | pipe5: Pipe \#5 { 103 | ALU 104 | Branch 105 | Flag 106 | } 107 | rob -> sched4 -> rf -> pipe5 108 | 109 | # Source: Ampere 110 | sched5: FP/Vector Scheduler X 111 | 112 | # Source: Ampere 113 | pipe6: Pipe \#6 { 114 | Vector 115 | FP 116 | } 117 | rob -> sched5 -> rf -> pipe6 118 | 119 | # Source: Ampere 120 | sched6: FP/Vector Scheduler Y 121 | 122 | # Source: Ampere 123 | pipe7: Pipe \#7 { 124 | FP store data 125 | } 126 | rob -> sched5 -> rf -> pipe7 127 | rob -> sched6 -> rf -> pipe7 128 | 129 | # Source: Ampere 130 | pipe8: Pipe \#8 { 131 | Vector 132 | FP 133 | } 134 | rob -> sched6 -> rf -> pipe8 135 | 136 | # Source: Ampere 137 | sched7: Memory Scheduler 0 138 | 139 | # Source: Ampere 140 | pipe9: Pipe \#9 { 141 | Load 142 | } 143 | rob -> sched7 -> rf -> pipe9 144 | 145 | # Source: Ampere 146 | pipe10: Pipe \#10 { 147 | Store 148 | } 149 | rob -> sched7 -> rf -> pipe10 150 | 151 | # Source: Ampere 152 | sched8: Memory Scheduler 1 153 | 154 | # Source: Ampere 155 | pipe11: Pipe \#11 { 156 | Load 157 | } 158 | rob -> sched8 -> rf -> pipe11 159 | 160 | # Source: Ampere 161 | pipe12: Pipe \#12 { 162 | Store 163 | } 164 | rob -> sched8 -> rf -> pipe12 165 | 166 | lsu: LSU { 167 | Load Queue 168 | Store Queue 169 | } 170 | 171 | pipe9 -> lsu 172 | pipe10 -> lsu 173 | pipe11 -> lsu 174 | pipe12 
-> lsu 175 | } 176 | frontend.rename -> backend.rob 177 | 178 | mem: Memory { 179 | l1: L1 DC { 180 | # Source: Ampere 181 | l1dtlb: 64-entry fully-associative L1 DTLB 182 | 183 | # Source: Ampere 184 | l1dc: 64KB 4-way L1 DC 185 | 186 | # Source: Ampere 187 | l2itlb: 768-entry 6-way L2 ITLB 188 | 189 | # Source: Ampere 190 | l2dtlb: 1536-entry 6-way L2 DTLB 191 | 192 | # Source: Ampere 193 | 4-cycle load to use 194 | 2x128bit load and 1x128bit write per cycle 195 | 8 page table walkers 196 | } 197 | 198 | l2: L2 { 199 | # Source: Ampere 200 | 2MB 8-way L2 Cache 201 | 11-cycle load to use latency 202 | } 203 | 204 | l1 -> l2 205 | 206 | l3: L3 { 207 | 32MB L3 Cache 208 | } 209 | l2 -> l3 210 | } 211 | frontend.l1ic -> mem.l2 212 | backend.lsu -> mem.l1 213 | 214 | info: |md 215 | Drawn by Jiajie Chen @jiegec 216 | 217 | Based on data from Chips and Cheese, Ampere 218 | | 219 | } -------------------------------------------------------------------------------- /docs/ampere_one.md: -------------------------------------------------------------------------------- 1 | # Ampere One 2 | 3 | ![](./ampere_one.svg) 4 | 5 | References: 6 | 7 | - [AmpereOne at Hot Chips 2024: Maximizing Density](https://chipsandcheese.com/2024/08/29/ampereone-at-hot-chips-2024-maximizing-density/) 8 | -------------------------------------------------------------------------------- /docs/avalanche.d2: -------------------------------------------------------------------------------- 1 | cpu : Apple M2 Avalanche CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: JamesAslan 5 | l1btb: 1024-entry L1 BTB, 1 cycle latency 6 | 7 | # Source: JamesAslan 8 | l2btb: 3072-entry L2 BTB, 2 cycle latency 9 | 10 | # Source: JamesAslan 11 | l3btb: 192KB L1 IC as L3 BTB, 3 cycle latency 12 | } 13 | 14 | # Coupled Frontend 15 | l1ic: L1 IC { 16 | # Source: Geekerwan 17 | l1ic: 192KB L1 IC 18 | } 19 | 20 | bp -> l1ic 21 | 22 | # Source: Geekerwan 23 | decode: 8-way Decode 24 | l1ic -> decode 25 
| decode -> bp 26 | 27 | # Source: Geekerwan 28 | rename: 8-way Rename 29 | decode -> rename 30 | } 31 | 32 | backend: Backend { 33 | # Source: Geekerwan 34 | rob: 274-entry Coalesced ROB 35 | 36 | rf: Register File { 37 | # Source: Geekerwan 38 | irf: ~350-entry Integer Register File 39 | 40 | # Source: Geekerwan 41 | vrf: ~380-entry 128b Vector Register File 42 | } 43 | 44 | # Source: Geekerwan 45 | dispatch1: 12-entry Dispatch Queue \#1 46 | 47 | # Source: Geekerwan 48 | sched1: 24-entry Scheduler \#1 49 | 50 | # Source: Geekerwan 51 | pipe1: Pipe \#1 { 52 | ALU 53 | BR 54 | FLAGS 55 | ADR 56 | } 57 | dispatch1 -> sched1 -> rf.irf -> pipe1 58 | 59 | # Source: Geekerwan 60 | sched2: 26-entry Scheduler \#2 61 | 62 | # Source: Geekerwan 63 | pipe2: Pipe \#2 { 64 | ALU 65 | BR 66 | FLAGS 67 | ADR 68 | } 69 | dispatch1 -> sched2 -> rf.irf -> pipe2 70 | 71 | # Source: Geekerwan 72 | sched3: 16-entry Scheduler \#3 73 | 74 | # Source: Geekerwan 75 | pipe3: Pipe \#3 { 76 | ALU 77 | FLAGS 78 | } 79 | dispatch1 -> sched3 -> rf.irf -> pipe3 80 | 81 | # Source: Geekerwan 82 | dispatch2: 12-entry Dispatch Queue \#2 83 | 84 | # Source: Geekerwan 85 | sched4: 12-entry Scheduler \#4 86 | 87 | # Source: Geekerwan 88 | pipe4: Pipe \#4 { 89 | ALU 90 | } 91 | dispatch2 -> sched4 -> rf.irf -> pipe4 92 | 93 | # Source: Geekerwan 94 | sched5: 28-entry Scheduler \#5 95 | 96 | # Source: Geekerwan 97 | pipe5: Pipe \#5 { 98 | ALU 99 | MUL 100 | DIV 101 | } 102 | dispatch2 -> sched5 -> rf.irf -> pipe5 103 | 104 | # Source: Geekerwan 105 | sched6: 28-entry Scheduler \#6 106 | 107 | # Source: Geekerwan 108 | pipe6: Pipe \#6 { 109 | ALU 110 | MUL 111 | BFM 112 | MADD 113 | } 114 | dispatch2 -> sched6 -> rf.irf -> pipe6 115 | 116 | # Source: Geekerwan 117 | dispatch3: 10-entry Dispatch Queue \#3 118 | 119 | # Source: Geekerwan 120 | sched7: 52-entry Scheduler \#7 121 | 122 | # Source: Geekerwan 123 | pipe7: Pipe \#7 { 124 | STORE 125 | } 126 | dispatch3 -> sched7 -> rf.irf -> pipe7 127 | 128 | 
# Source: Geekerwan 129 | pipe8: Pipe \#8 { 130 | LOAD 131 | STORE 132 | } 133 | dispatch3 -> sched7 -> rf.irf -> pipe8 134 | 135 | # Source: Geekerwan 136 | pipe9: Pipe \#9 { 137 | LOAD 138 | } 139 | dispatch3 -> sched7 -> rf.irf -> pipe9 140 | 141 | # Source: Geekerwan 142 | pipe10: Pipe \#10 { 143 | LOAD 144 | } 145 | dispatch3 -> sched7 -> rf.irf -> pipe10 146 | 147 | lsu: LSU { 148 | # Source: Geekerwan 149 | 128-entry Load Queue 150 | 60-entry Store Queue 151 | } 152 | 153 | pipe7 -> lsu 154 | pipe8 -> lsu 155 | pipe9 -> lsu 156 | pipe10 -> lsu 157 | 158 | rob -> dispatch1 159 | rob -> dispatch2 160 | rob -> dispatch3 161 | 162 | # Source: Geekerwan 163 | dispatch4: 12-entry Dispatch Queue \#4 164 | 165 | # Source: Geekerwan 166 | sched8: 36-entry Scheduler \#8 167 | 168 | # Source: Geekerwan 169 | pipe11: Pipe \#11 { 170 | FP 171 | SIMD 172 | } 173 | dispatch4 -> sched8 -> rf.vrf -> pipe11 174 | 175 | # Source: Geekerwan 176 | sched9: 36-entry Scheduler \#9 177 | 178 | # Source: Geekerwan 179 | pipe12: Pipe \#12 { 180 | FP 181 | SIMD 182 | } 183 | dispatch4 -> sched9 -> rf.vrf -> pipe12 184 | 185 | # Source: Geekerwan 186 | sched10: 36-entry Scheduler \#10 187 | 188 | # Source: Geekerwan 189 | pipe13: Pipe \#13 { 190 | FP 191 | SIMD 192 | TO INT 193 | } 194 | dispatch4 -> sched10 -> rf.vrf -> pipe13 195 | 196 | # Source: Geekerwan 197 | sched11: 36-entry Scheduler \#11 198 | 199 | # Source: Geekerwan 200 | pipe14: Pipe \#14 { 201 | FP 202 | SIMD 203 | FSQRT 204 | FCSEL 205 | TO INT 206 | } 207 | dispatch4 -> sched11 -> rf.vrf -> pipe14 208 | rob -> dispatch4 209 | } 210 | frontend.rename -> backend.rob 211 | 212 | mem: Memory { 213 | l1: L1 DC { 214 | # Source: Geekerwan 215 | l1dc: 128KB L1DC 216 | } 217 | } 218 | backend.lsu -> mem.l1 219 | 220 | info: |md 221 | Drawn by Jiajie Chen @jiegec 222 | 223 | Based on data from Geekerwan 224 | | 225 | } -------------------------------------------------------------------------------- /docs/avalanche.md: 
-------------------------------------------------------------------------------- 1 | # Apple M2 P-core aka Avalanche 2 | 3 | ![](./avalanche.svg) 4 | 5 | References: 6 | 7 | - [Apple M2 Blizzard 微架构评测(上):阳春白雪](https://zhuanlan.zhihu.com/p/675322260) 8 | - [Apple M2 Blizzard 微架构评测(中):阳春白雪](https://zhuanlan.zhihu.com/p/678983061) 9 | - [不为人知的角落,Apple M2 的小小努力(其一)](https://zhuanlan.zhihu.com/p/662561990) 10 | - [苹果 M4 性能分析:尽力了,但芯片工艺快到头了!](https://www.bilibili.com/video/BV1NJ4m1w7zk/) 11 | -------------------------------------------------------------------------------- /docs/cbp.md: -------------------------------------------------------------------------------- 1 | # Reverse Engineered Conditional Branch Predictors 2 | 3 | Reverse engineered conditional branch predictors, using the methodology from the following papers: 4 | 5 | - Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution 6 | - Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis 7 | 8 | Code can be found at [jiegec/cpu-micro-benchmarks](https://github.com/jiegec/cpu-micro-benchmarks). 
9 | 10 | Glossary: 11 | 12 | - B: the branch address; on ARM64, it is the address of the first byte of the instruction; on AMD64, it is the address of the last byte of the instruction 13 | - T: the target address 14 | - PHR: Path History Register 15 | - PHT: Pattern History Table 16 | - footprint: how many bits are xor-ed into PHR for each taken branch 17 | 18 | Overview: 19 | 20 | | uArch | PHR in bits | T bits | B bits | footprint bits | 21 | |---------------------|----------------------|---------|---------|----------------| 22 | | Qualcomm Oryon | `100*1 + 32*1 = 132` | T[31:2] | B[5:2] | 30 + 4 | 23 | | Apple Firestorm | `100*1 + 28*1 = 128` | T[31:2] | B[5:2] | 30 + 4 | 24 | | Apple Icestorm | `60*1 + 16*1 = 76` | T[47:2] | B[5:2] | 46 + 4 | 25 | | ARM Neoverse V1 | `64*3 = 192` | T[7:2] | B[14:2] | 3 | 26 | | ARM Neoverse N1 | `48*3 = 144` | T[7:2] | B[8:2] | 3 | 27 | | Intel Sunny Cove | `194*2 = 388` | T[5:0] | B[15:0] | 16 | 28 | | Intel Golden Cove | `194*2 = 388` | T[5:0] | B[15:0] | 16 | 29 | | Intel Raptor Cove | `194*2 = 388` | T[5:0] | B[15:0] | 16 | 30 | | Intel Redwood Cove | `194*2 = 388` | T[5:0] | B[15:0] | 16 | 31 | | Intel Cascade Lake | `93*2 = 186` | T[5:0] | B[18:3] | 16 | 32 | | Intel Skylake | `93*2 = 186` | T[5:0] | B[18:3] | 16 | 33 | | Intel Haswell | `93*2 = 186` | T[5:0] | B[19:4] | 16 | 34 | | Intel Ivy Bridge | `93*2 = 186` | T[5:0] | B[19:4] | 16 | 35 | 36 | ## Qualcomm Oryon 37 | 38 | - PHRT: 100 bits 39 | - PHRB: 32 bits 40 | - PHRT is updated upon taken branch: `PHRTnew = (PHRTold << 2) xor T[31:2]` 41 | - PHRB is updated upon taken branch: `PHRBnew = (PHRBold << 2) xor B[5:2]` 42 | - PHT: 6 tables, see [Result of Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon](https://github.com/jiegec/cpu-micro-benchmarks/blob/master/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/README.md) 43 | - Source: Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software 
Optimization and Architectural Analysis 44 | 45 | ## ARM Neoverse V1 46 | 47 | - PHR: `64*3=192` bits 48 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 3) xor footprint` 49 | - footprint has 3 bits: 50 | - footprint[0] = T[2] xor T[5] xor B[3] xor B[6] xor B[9] xor B[12] 51 | - footprint[1] = T[3] xor T[6] xor B[4] xor B[7] xor B[10] xor B[13] 52 | - footprint[2] = T[4] xor T[7] xor B[5] xor B[8] xor B[11] xor B[14] 53 | - Source: [Jiajie Chen](https://github.com/jiegec) 54 | 55 | ## ARM Neoverse N1 56 | 57 | - PHR: `48*3=144` bits 58 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 3) xor footprint` 59 | - footprint has 3 bits: 60 | - footprint[0] = T[2] xor T[5] xor B[3] xor B[6] 61 | - footprint[1] = T[3] xor T[6] xor B[4] xor B[7] 62 | - footprint[2] = T[4] xor T[7] xor B[5] xor B[8] 63 | - Source: [Jiajie Chen](https://github.com/jiegec) 64 | 65 | ## Apple Firestorm 66 | 67 | - PHRT: 100 bits 68 | - PHRB: 28 bits 69 | - PHRT is updated upon taken branch: `PHRTnew = (PHRTold << 2) xor T[31:2]` 70 | - PHRB is updated upon taken branch: `PHRBnew = (PHRBold << 2) xor B[5:2]` 71 | - PHT: 6 tables, see [Result of Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon](https://github.com/jiegec/cpu-micro-benchmarks/blob/master/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/README.md) 72 | - Source: Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis 73 | 74 | ## Apple Icestorm 75 | 76 | - PHRT: 60 bits 77 | - PHRB: 16 bits 78 | - PHRT is updated upon taken branch: `PHRTnew = (PHRTold << 2) xor T[47:2]` 79 | - PHRB is updated upon taken branch: `PHRBnew = (PHRBold << 2) xor B[5:2]` 80 | - Source: [Jiajie Chen](https://github.com/jiegec) 81 | 82 | ## Intel Haswell/Ivy Bridge 83 | 84 | - PHR: `93*2=186` bits 85 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 2) xor footprint` 86 | - footprint has 16 bits: 87 | - 
footprint[0] = B[6] xor T[0] 88 | - footprint[1] = B[7] xor T[1] 89 | - footprint[2] = B[10] xor T[2] 90 | - footprint[3] = B[11] xor T[3] 91 | - footprint[4] = B[14] xor T[4] 92 | - footprint[5] = B[15] xor T[5] 93 | - footprint[6] = B[4] 94 | - footprint[7] = B[5] 95 | - footprint[8] = B[8] 96 | - footprint[9] = B[9] 97 | - footprint[10] = B[12] 98 | - footprint[11] = B[13] 99 | - footprint[12] = B[16] 100 | - footprint[13] = B[17] 101 | - footprint[14] = B[18] 102 | - footprint[15] = B[19] 103 | - PHT: 104 | - 3 tables 105 | - each table is 4-way associative 106 | - each table has 9 index bits, including PC[4] 107 | - each table has `4*2^9=2048` entries 108 | - Source: Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution (There is a typo in `B13 xor T5`, which should be `B15 xor T5`) 109 | - Reproduced by [Jiajie Chen](https://github.com/jiegec) 110 | 111 | ## Intel Cascade Lake/Skylake 112 | 113 | - PHR: `93*2=186` bits 114 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 2) xor footprint` 115 | - footprint has 16 bits: 116 | - footprint[0] = B[3] xor T[0] 117 | - footprint[1] = B[4] xor T[1] 118 | - footprint[2] = B[7] xor T[2] 119 | - footprint[3] = B[8] xor T[3] 120 | - footprint[4] = B[11] xor T[4] 121 | - footprint[5] = B[12] xor T[5] 122 | - footprint[6] = B[5] 123 | - footprint[7] = B[6] 124 | - footprint[8] = B[9] 125 | - footprint[9] = B[10] 126 | - footprint[10] = B[13] 127 | - footprint[11] = B[14] 128 | - footprint[12] = B[15] 129 | - footprint[13] = B[16] 130 | - footprint[14] = B[17] 131 | - footprint[15] = B[18] 132 | - PHT: 133 | - 3 tables 134 | - history length of the 3 tables: 22, 58, 186 135 | - each table is 4-way associative 136 | - each table has 9 index bits, including PC[5] 137 | - each table has `4*2^9=2048` entries 138 | - Source: Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution 139 | - Reproduced by [Jiajie 
Chen](https://github.com/jiegec) 140 | 141 | ## Intel Sunny Cove/Golden Cove/Raptor Cove/Redwood Cove 142 | 143 | - PHR: `194*2=388` bits 144 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 2) xor footprint` 145 | - footprint has 16 bits: 146 | - footprint[0] = B[3] xor T[0] 147 | - footprint[1] = B[4] xor T[1] 148 | - footprint[2] = B[5] 149 | - footprint[3] = B[6] 150 | - footprint[4] = B[7] 151 | - footprint[5] = B[8] 152 | - footprint[6] = B[9] 153 | - footprint[7] = B[10] 154 | - footprint[8] = B[0] xor T[2] 155 | - footprint[9] = B[1] xor T[3] 156 | - footprint[10] = B[2] xor T[4] 157 | - footprint[11] = B[11] xor T[5] 158 | - footprint[12] = B[12] 159 | - footprint[13] = B[13] 160 | - footprint[14] = B[14] 161 | - footprint[15] = B[15] 162 | - PHT: 163 | - 4 tables 164 | - history length of the 4 tables: 36, 68, 132, 388 165 | - each table is 4-way associative 166 | - each table has 9 index bits, including PC[5] 167 | - each table has `4*2^9=2048` entries 168 | - Source: Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution ([Jiajie Chen](https://github.com/jiegec) reaches a different conclusion from the paper regarding the PHT tables) 169 | - Reproduced by [Jiajie Chen](https://github.com/jiegec) 170 | -------------------------------------------------------------------------------- /docs/comparison.md: -------------------------------------------------------------------------------- 1 | # Comparisons across microarchitectures 2 | 3 | ## Frontend 4 | 5 | ### Branch Prediction 6 | 7 | {{ bp_comparison() }} 8 | 9 | ### L1 ICache + ITLB 10 | 11 | {{ l1ic_comparison() }} 12 | 13 | ### Move Elimination / Zeroing Idiom / Ones Idiom 14 | 15 | | Pattern\uArch | Oryon | Firestorm | Golden Cove | Cortex X1 | Zen 3 | Sunny Cove | Zen 1-2 | 16 | |----------------------------|---------|-----------|-------------|-----------|---------|------------|---------| 17 | | # ALU | 6 | 6 | 5 | 4 | 4 | 4 | 4 | 18 | | #
Dispatch | 8 | 8 | 6 | 8 | 6 | 5 | 5 | 19 | | Dep int add | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 20 | | Indep int add | 6.0 | 3.9 | 4.7 | 4.0 | 4.0 | 4.0 | 4.0 | 21 | | Dep int mov | 1.2 | 1.2 | **5.5** | 1.3 | **6.0** | **4.6** | **5.0** | 22 | | Indep int mov | **8.0** | **8.0** | **5.4** | 4.0 | **6.0** | **4.6** | **5.0** | 23 | | Dep zero via xor | 1.0 | 1.0 | **5.5** | 1.0 | **6.0** | **4.6** | *4.0* | 24 | | Dep zero via sub | 1.0 | 1.0 | **6.0** | 1.0 | **6.0** | **4.6** | *4.0* | 25 | | Indep set zero via mov | 6.0 | **8.0** | **6.0** | **6.0** | 4.0 | 3.7 | 4.0 | 26 | | Indep set one via mov | 6.0 | **7.8** | **6.0** | 4.0 | 4.0 | 4.0 | 4.0 | 27 | | Indep set two via mov | 6.0 | **7.8** | **6.0** | 4.0 | 4.0 | 4.0 | 4.0 | 28 | | Indep set 1024 via mov | 6.0 | **7.8** | 5.0 | 4.0 | 4.0 | 4.0 | 4.0 | 29 | | Vec dep mov | 0.6 | 0.6 | **6.0** | 0.5 | **6.0** | 1.0 | 4.0 | 30 | | Vec indep mov | **8.0** | **8.0** | **6.0** | 4.0 | **6.0** | 3.0 | 4.0 | 31 | | Vec dep set zero via xor | 0.5 | 0.5 | **6.0** | 0.5 | *4.0* | **5.0** | *4.0* | 32 | | Vec dep set zero via sub | 0.5 | 0.5 | 0.5 | 0.5 | 0.3 | 0.25 | 0.3 | 33 | | Vec indep set zero via mov | 4.0 | **8.0** | N/A | **6.0** | N/A | N/A | N/A | 34 | | Nop | **8.0** | **8.0** | **5.7** | **8.0** | **6.0** | 4.0 | **5.0** | 35 | 36 | - **Bold**: Not executed by ALU/FPU, eliminated at rename stage 37 | - *Italics*: Executed by ALU/FPU, but source register dependency was removed so that dependent ops can be executed in parallel 38 | - Although Cortex-X1 has a dispatch width of 8, it has many limitations on instruction type 39 | 40 | ## Backend 41 | 42 | ### ROB 43 | 44 | {{ rob_comparison() }} 45 | 46 | ### LSU 47 | 48 | | uArch | 64b Load | 64b Store | 128b Load | 128b Store | 256b Load | 256b Store | 49 | |-------------|----------|-----------|-----------|------------|-----------|------------| 50 | | Zen2 | 2/cycle | 1/cycle | 2/cycle | 1/cycle | 2/cycle | 1/cycle | 51 | | Zen4 | 3/cycle | 2/cycle |
2/cycle | 1/cycle | 2/cycle | 1/cycle | 52 | | Golden Cove | 3/cycle | 2/cycle | 3/cycle | 2/cycle | 3/cycle | 2/cycle | 53 | | Firestorm | 3/cycle | 2/cycle | 3/cycle | 2/cycle | 1.5/cycle | 1/cycle | 54 | | Oryon | 4/cycle | 2/cycle | 4/cycle | 2/cycle | 2/cycle | 1/cycle | 55 | 56 | ### Execution Unit 57 | 58 | {{ eu_comparison() }} 59 | 60 | ## Comparison between microarchitectures 61 | 62 | ### Firestorm vs Oryon 63 | 64 | {{ firestorm_oryon_comparison() }} 65 | 66 | ### Cortex-X series 67 | 68 | {{ cortex_x_comparison() }} 69 | 70 | -------------------------------------------------------------------------------- /docs/cortex_a75.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Cortex-A75 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | ras: RAS 6 | 7 | # Source: Chips and Cheese 8 | l0btb: 48-entry L0 BTB 9 | 10 | # Source: Chips and Cheese 11 | l1btb: 3072-entry L1 BTB 12 | } 13 | 14 | l1ic: L1 IC { 15 | # Source: Chips and Cheese 16 | l1ic: 64KB 4-way L1 IC 17 | 18 | # Source: Chips and Cheese 19 | l1itlb: 32-entry L1 ITLB 20 | } 21 | 22 | bp -> l1ic 23 | 24 | # Source: Chips and Cheese 25 | decode: 3-way Decode 26 | l1ic -> decode: 12 bytes/cycle 27 | decode -> bp 28 | 29 | # Source: Chips and Cheese 30 | rename: 3-way Rename 31 | decode -> rename 32 | } 33 | 34 | backend: Backend { 35 | # Source: Chips and Cheese 36 | rob: 73-entry ROB 37 | 38 | rf: Register File { 39 | # Source: Chips and Cheese 40 | irf: 101-entry Integer Register File 41 | 42 | # Source: Chips and Cheese 43 | frf: 89-entry FP Register File 44 | } 45 | 46 | # Source: Chips and Cheese 47 | sched1: Scheduler \#1 48 | 49 | # Source: Chips and Cheese 50 | pipe1: Pipe \#1 { 51 | ALU 52 | } 53 | rob -> sched1 -> rf -> pipe1 54 | 55 | # Source: Chips and Cheese 56 | sched2: Scheduler \#2 57 | 58 | # Source: Chips and Cheese 59 | pipe2: Pipe \#2 { 60 | ALU 61 | INT MUL 62 | } 63 | rob -> sched2 -> rf
-> pipe2 64 | 65 | # Source: Chips and Cheese 66 | sched3: Scheduler \#3 67 | 68 | # Source: Chips and Cheese 69 | pipe3: Pipe \#3 { 70 | Branch 71 | } 72 | rob -> sched3 -> rf -> pipe3 73 | 74 | # Source: Chips and Cheese 75 | sched4: Scheduler \#4 76 | 77 | # Source: Chips and Cheese 78 | pipe4: Pipe \#4 { 79 | AGU 80 | } 81 | rob -> sched4 -> rf -> pipe4 82 | 83 | # Source: Chips and Cheese 84 | sched5: Scheduler \#5 85 | 86 | # Source: Chips and Cheese 87 | pipe5: Pipe \#5 { 88 | AGU 89 | } 90 | rob -> sched5 -> rf -> pipe5 91 | 92 | # Source: Chips and Cheese 93 | sched6: Scheduler \#6 94 | 95 | # Source: Chips and Cheese 96 | pipe6: Pipe \#6 { 97 | FMA 98 | 128b ALU 99 | AES 100 | } 101 | rob -> sched6 -> rf -> pipe6 102 | 103 | # Source: Chips and Cheese 104 | sched7: Scheduler \#7 105 | 106 | # Source: Chips and Cheese 107 | pipe7: Pipe \#7 { 108 | FMA 109 | 128b ALU 110 | } 111 | rob -> sched7 -> rf -> pipe7 112 | 113 | lsu: LSU { 114 | # Source: Chips and Cheese 115 | 69-entry Load Queue 116 | 14-entry Store Queue 117 | } 118 | 119 | pipe4 -> lsu 120 | pipe5 -> lsu 121 | } 122 | frontend.rename -> backend.rob 123 | 124 | mem: Memory { 125 | l1: L1 DC { 126 | # Source: Chips and Cheese 127 | l1dc: 64KB 4-way L1DC 128 | l1dtlb: 48-entry L1 DTLB 129 | l2tlb: 1024-entry 4-way L2 TLB 130 | } 131 | 132 | # Source: Chips and Cheese 133 | l2: L2 DC 134 | l1 -> l2 135 | } 136 | backend.lsu -> mem.l1 137 | 138 | info: |md 139 | Drawn by Jiajie Chen @jiegec 140 | 141 | Based on data from Chips and Cheese 142 | | 143 | } -------------------------------------------------------------------------------- /docs/cortex_a75.md: -------------------------------------------------------------------------------- 1 | # ARM Cortex A75 2 | 3 | ![](./cortex_a75.svg) 4 | 5 | References: 6 | 7 | - [Inside SiFive’s P550 Microarchitecture](https://chipsandcheese.com/p/inside-sifives-p550-microarchitecture) 8 | 
-------------------------------------------------------------------------------- /docs/cortex_a77.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Cortex-A77 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: ARM 5 | # "4x larger L1-BTB (64-entry, 1-cycle latency)" 6 | l1btb: 64-entry L1 BTB, 0 bubble 7 | 8 | # Source: ARM 9 | # "33% larger main BTB (8K entry, better performance for real workloads)" 10 | l2btb: 8K-entry L2 BTB 11 | } 12 | 13 | l1ic: L1 IC { 14 | # Source: ARM 15 | l1ic: 64KB 4-way L1 IC 16 | 17 | # Source: ARM 18 | l1itlb: 48-entry fully associative L1 ITLB 19 | } 20 | 21 | fq: Fetch Queue 22 | bp -> fq 23 | fq -> l1ic 24 | 25 | iq: Instruction Queue 26 | l1ic -> iq 27 | 28 | # Source: ARM 29 | # "6 Instrs/cycle" 30 | decode: 6-way Decode 31 | iq -> decode 32 | 33 | # Source: ARM 34 | mopc: 1.5K MOP Cache 35 | decode -> mopc 36 | bp -> mopc 37 | 38 | mop: MOP Queue 39 | 40 | # Source: ARM 41 | mopc -> mop 42 | decode -> mop 43 | 44 | # Source: ARM 45 | rename: 6-way Rename 46 | mop -> rename 47 | } 48 | 49 | backend: Backend { 50 | # Source: ARM 51 | # "160 entry instruction window" 52 | rob: 160-entry ROB 53 | 54 | rf: Register File { 55 | irf: Integer Register File 56 | 57 | flagsrf: Flags Register File 58 | 59 | vrf: FP/Vector Register File 60 | } 61 | 62 | # Source: ARM 63 | sched1: ALU Scheduler 64 | 65 | # Source: ARM 66 | pipe1: Pipe \#1 { 67 | Branch 68 | } 69 | rob -> sched1 -> rf -> pipe1 70 | 71 | # Source: ARM 72 | pipe2: Pipe \#2 { 73 | Branch 74 | } 75 | rob -> sched1 -> rf -> pipe2 76 | 77 | # Source: ARM 78 | pipe3: Pipe \#3 { 79 | ALU 80 | } 81 | rob -> sched1 -> rf -> pipe3 82 | 83 | # Source: ARM 84 | pipe4: Pipe \#4 { 85 | ALU 86 | } 87 | rob -> sched1 -> rf -> pipe4 88 | 89 | # Source: ARM 90 | pipe5: Pipe \#5 { 91 | ALU 92 | } 93 | rob -> sched1 -> rf -> pipe5 94 | 95 | # Source: ARM 96 | pipe6: Pipe \#6 { 97 | ALU 98 | MAC 99 | DIV 100 | } 101 | rob ->
sched1 -> rf -> pipe6 102 | 103 | # Source: ARM 104 | sched2: FP/Vector Scheduler 105 | 106 | # Source: ARM 107 | pipe7: Pipe \#7 { 108 | FMUL 109 | FADD 110 | FDIV 111 | Vec ALU 112 | IMAC 113 | } 114 | rob -> sched2 -> rf -> pipe7 115 | 116 | # Source: ARM 117 | pipe8: Pipe \#8 { 118 | FMUL 119 | FADD 120 | Vec ALU 121 | } 122 | rob -> sched2 -> rf -> pipe8 123 | 124 | # Source: ARM 125 | sched3: Memory Scheduler 126 | 127 | # Source: ARM 128 | pipe9: Pipe \#9 { 129 | Store Data 130 | } 131 | rob -> sched3 -> rf -> pipe9 132 | 133 | # Source: ARM 134 | pipe10: Pipe \#10 { 135 | Store Data 136 | } 137 | rob -> sched3 -> rf -> pipe10 138 | 139 | # Source: ARM 140 | pipe11: Pipe \#11 { 141 | Load AGU 142 | Store AGU 143 | } 144 | rob -> sched3 -> rf -> pipe11 145 | 146 | # Source: ARM 147 | pipe12: Pipe \#12 { 148 | Load AGU 149 | Store AGU 150 | } 151 | rob -> sched3 -> rf -> pipe12 152 | 153 | lsu: LSU { 154 | Load Queue 155 | Store Queue 156 | # Source: ARM Software Optimization Guide 157 | 4 load to use integer latency 158 | 5 load to use fp latency 159 | 2 load/cycle 160 | 2 store/cycle 161 | } 162 | 163 | pipe9 -> lsu 164 | pipe10 -> lsu 165 | pipe11 -> lsu 166 | pipe12 -> lsu 167 | } 168 | frontend.rename -> backend.rob 169 | 170 | mem: Memory { 171 | l1: L1 DC { 172 | # Source: ARM 173 | l1dc: 64KB 4-way L1 DC 174 | 175 | # Source: ARM 176 | l1dtlb: 48-entry fully associative L1 DTLB 177 | } 178 | 179 | l2: L2 { 180 | # Source: ARM 181 | 128KB/256KB/512KB 8-way L2 Cache 182 | 183 | # Source: ARM 184 | l2tlb: 1280-entry 5-way L2 TLB 185 | } 186 | 187 | l1 -> l2 188 | 189 | l3: L3 { 190 | # Source: ARM 191 | 512KB-4MB shared L3 cache 192 | } 193 | l2 -> l3 194 | } 195 | frontend.l1ic -> mem.l2 196 | backend.lsu -> mem.l1 197 | 198 | info: |md 199 | Drawn by Jiajie Chen @jiegec 200 | 201 | Based on data from ARM and Anandtech 202 | | 203 | } -------------------------------------------------------------------------------- /docs/cortex_a77.md: 
-------------------------------------------------------------------------------- 1 | # ARM Cortex A77 2 | 3 | ![](./cortex_a77.svg) 4 | 5 | References: 6 | 7 | - [Arm's New Cortex-A77 CPU Micro-architecture: Evolving Performance](https://www.anandtech.com/show/14384/arm-announces-cortexa77-cpu-ip) 8 | - [Arm® Cortex®‑A77 Core Technical Reference Manual](https://developer.arm.com/documentation/101111/0101) 9 | - [Arm Cortex-A77 Core Software Optimization Guide](https://developer.arm.com/documentation/swog011050/latest/) 10 | -------------------------------------------------------------------------------- /docs/cortex_x1.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Cortex-X1 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: ARM, JamesAslan 5 | # "50% larger L0-BTB capacity, 96 entries (zero-cycle bubble taken-branch latency)" 6 | l1btb: 96-entry L1 BTB, zero bubble, two taken branches per cycle 7 | 8 | # Source: JamesAslan 9 | l2btb: 8192-entry L2 BTB 10 | 11 | # Source: JamesAslan 12 | ras: 16-entry RAS 13 | } 14 | 15 | l1ic: L1 IC { 16 | # Source: JamesAslan 17 | # Source: ARM 18 | l1itlb: 48-entry fully-associative L1 ITLB 19 | 20 | # Source: ARM 21 | l1ic: 4-way 64KB L1 IC 22 | } 23 | 24 | fq: Fetch Queue 25 | bp -> fq 26 | fq -> l1ic 27 | 28 | iq: Instruction Queue 29 | # "5 instruction fetch from the instruction cache" 30 | # "Instructions are first fetched and then decoded into internal 31 | # Macro-OPerations (MOPs). From there, the MOPs proceed through register 32 | # renaming and dispatch stages. A MOP can be split into two Micro- 33 | # OPerations (µOPs) further down the pipeline after the decode stage. Once 34 | # dispatched, µOPs wait for their operands and issue out-of-order to one of 35 | # fifteen issue pipelines. Each issue pipeline can accept one µOP per 36 | # cycle." 
37 | l1ic -> iq: 5 inst/cycle 38 | 39 | # Source: ARM 40 | decode: 5-way Decode 41 | iq -> decode 42 | 43 | # Source: ARM 44 | # "3K entries, for increased coverage" 45 | mopc: 3072-entry 4-way skewed-associative MOP Cache 46 | decode -> mopc 47 | bp -> mopc 48 | 49 | mop: MOP Queue 50 | 51 | # Source: ARM 52 | # "8 Mop fetch from the Mop cache" 53 | mopc -> mop: 8 macro ops/cycle 54 | decode -> mop: 5 instructions/cycle 55 | 56 | # Source: ARM 57 | # "The dispatch stage can process up to 8 MOPs per cycle and dispatch up to 58 | # 16 µOPs per cycle, with the following limitations ..." 59 | rename: 8-way Rename { 60 | Zero Idiom 61 | } 62 | mop -> rename 63 | } 64 | 65 | backend: Backend { 66 | # Source: ARM 67 | # "224 entry instruction window" 68 | rob: 224-entry ROB 69 | 70 | rf: Register File { 71 | irf: Integer Register File 72 | 73 | flagsrf: Flags Register File 74 | 75 | vrf: FP/Vector Register File 76 | } 77 | 78 | # Source: ARM 79 | sched1: ALU Scheduler 80 | 81 | # Source: ARM 82 | pipe1: Pipe \#1 { 83 | Branch 84 | } 85 | rob -> sched1 -> rf -> pipe1 86 | 87 | # Source: ARM 88 | pipe2: Pipe \#2 { 89 | Branch 90 | } 91 | rob -> sched1 -> rf -> pipe2 92 | 93 | # Source: ARM 94 | pipe3: Pipe \#3 { 95 | ALU 96 | } 97 | rob -> sched1 -> rf -> pipe3 98 | 99 | # Source: ARM 100 | pipe4: Pipe \#4 { 101 | ALU 102 | } 103 | rob -> sched1 -> rf -> pipe4 104 | 105 | # Source: ARM 106 | pipe5: Pipe \#5 { 107 | ALU 108 | MUL 109 | } 110 | rob -> sched1 -> rf -> pipe5 111 | 112 | # Source: ARM 113 | pipe6: Pipe \#6 { 114 | ALU 115 | MAC 116 | DIV 117 | } 118 | rob -> sched1 -> rf -> pipe6 119 | 120 | # Source: ARM 121 | sched2: FP/Vector Scheduler 122 | 123 | # Source: ARM 124 | pipe7: Pipe \#7 { 125 | FMUL 126 | FADD 127 | FDIV 128 | Vec ALU 129 | IMAC 130 | } 131 | rob -> sched2 -> rf -> pipe7 132 | 133 | # Source: ARM 134 | pipe8: Pipe \#8 { 135 | FMUL 136 | FADD 137 | Vec ALU 138 | } 139 | rob -> sched2 -> rf -> pipe8 140 | 141 | # Source: ARM 142 | pipe9: Pipe \#9 
{ 143 | FMUL 144 | FADD 145 | FDIV 146 | Vec ALU 147 | IMAC 148 | } 149 | rob -> sched2 -> rf -> pipe9 150 | 151 | # Source: ARM 152 | pipe10: Pipe \#10 { 153 | FMUL 154 | FADD 155 | Vec ALU 156 | } 157 | rob -> sched2 -> rf -> pipe10 158 | 159 | # Source: ARM 160 | sched3: Memory Scheduler 161 | 162 | # Source: ARM 163 | pipe11: Pipe \#11 { 164 | Store Data 165 | } 166 | rob -> sched3 -> rf -> pipe11 167 | 168 | # Source: ARM 169 | pipe12: Pipe \#12 { 170 | Store Data 171 | } 172 | rob -> sched3 -> rf -> pipe12 173 | 174 | # Source: ARM 175 | pipe13: Pipe \#13 { 176 | Load AGU 177 | Store AGU 178 | } 179 | rob -> sched3 -> rf -> pipe13 180 | 181 | # Source: ARM 182 | pipe14: Pipe \#14 { 183 | Load AGU 184 | Store AGU 185 | } 186 | rob -> sched3 -> rf -> pipe14 187 | 188 | # Source: ARM 189 | pipe15: Pipe \#15 { 190 | Load AGU 191 | } 192 | rob -> sched3 -> rf -> pipe15 193 | 194 | lsu: LSU { 195 | Load Queue 196 | Store Queue 197 | 4 load to use integer latency 198 | 6 load to use fp latency 199 | 3 load/cycle 200 | 2 store/cycle 201 | } 202 | 203 | pipe11 -> lsu 204 | pipe12 -> lsu 205 | pipe13 -> lsu 206 | pipe14 -> lsu 207 | pipe15 -> lsu 208 | } 209 | frontend.rename -> backend.rob 210 | 211 | mem: Memory { 212 | l1: L1 DC { 213 | # Source: ARM 214 | l1dtlb: 40-entry fully associative L1 DTLB 215 | 216 | # Source: ARM 217 | l1dc: 64KB L1 DC 218 | 219 | # Source: ARM 220 | l2tlb: 2048-entry 8-way associative 4-bank L2 TLB 221 | } 222 | 223 | l2: L2 { 224 | # Source: ARM 225 | 512KB/1MB 8-way L2 Cache 226 | } 227 | 228 | l1 -> l2 229 | 230 | l3: L3 { 231 | } 232 | l2 -> l3 233 | } 234 | frontend.l1ic -> mem.l2 235 | backend.lsu -> mem.l1 236 | 237 | info: |md 238 | Drawn by Jiajie Chen @jiegec 239 | 240 | Based on data from ARM, JamesAslan, Anandtech and Wikichip 241 | | 242 | } -------------------------------------------------------------------------------- /docs/cortex_x1.md: -------------------------------------------------------------------------------- 1 | 
# ARM Cortex X1 2 | 3 | ![](./cortex_x1.svg) 4 | 5 | References: 6 | 7 | - [Arm's New Cortex-A78 and Cortex-X1 Microarchitectures: An Efficiency and Performance Divergence](https://www.anandtech.com/show/15813/arm-cortex-a78-cortex-x1-cpu-ip-diverging/3) 8 | - [Arm Cortex-X1: The First From The Cortex-X Custom Program](https://fuse.wikichip.org/news/3543/arm-cortex-x1-the-first-from-the-cortex-x-custom-program/) 9 | - [ARM Cortex X1 微架构评测(上):向山进发](https://zhuanlan.zhihu.com/p/619033328) 10 | - [ARM Cortex X1 微架构(下):向山进发](https://zhuanlan.zhihu.com/p/620310569) 11 | - [Arm® Cortex®‑X1 Core Technical Reference Manual](https://developer.arm.com/documentation/101433/0102) 12 | -------------------------------------------------------------------------------- /docs/cortex_x2.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Cortex-X2 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | } 5 | 6 | l1ic: L1 IC { 7 | # Source: ARM 8 | l1itlb: 48-entry fully associative L1 ITLB 9 | 10 | # Source: ARM 11 | l1ic: 64KB 4-way L1 IC 12 | } 13 | 14 | fq: Fetch Queue 15 | bp -> fq 16 | fq -> l1ic 17 | 18 | iq: Instruction Queue 19 | l1ic -> iq 20 | 21 | decode: 5-way Decode 22 | iq -> decode 23 | 24 | # Source: ARM 25 | mopc: 3072-entry 4-way MOP Cache 26 | decode -> mopc 27 | bp -> mopc 28 | 29 | mop: MOP Queue 30 | 31 | mopc -> mop 32 | decode -> mop 33 | 34 | # Source: ARM 35 | # "The dispatch stage can process up to 8 MOPs per cycle and dispatch up to 36 | # 16 μOPs per cycle, with the following limitations ..." 
37 | rename: 8-way Rename { 38 | Zero Idiom 39 | } 40 | mop -> rename 41 | } 42 | 43 | backend: Backend { 44 | # Source: ARM 45 | rob: 288-MOP-entry ROB 46 | 47 | rf: Register File { 48 | irf: Integer Register File 49 | 50 | flagsrf: Flags Register File 51 | 52 | vrf: FP/Vector Register File 53 | } 54 | 55 | # Source: ARM 56 | sched1: ALU Scheduler 57 | 58 | # Source: ARM 59 | pipe1: Pipe \#1 { 60 | Branch 61 | } 62 | rob -> sched1 -> rf -> pipe1 63 | 64 | # Source: ARM 65 | pipe2: Pipe \#2 { 66 | Branch 67 | } 68 | rob -> sched1 -> rf -> pipe2 69 | 70 | # Source: ARM 71 | pipe3: Pipe \#3 { 72 | ALU 73 | } 74 | rob -> sched1 -> rf -> pipe3 75 | 76 | # Source: ARM 77 | pipe4: Pipe \#4 { 78 | ALU 79 | } 80 | rob -> sched1 -> rf -> pipe4 81 | 82 | # Source: ARM 83 | pipe5: Pipe \#5 { 84 | ALU 85 | MUL 86 | } 87 | rob -> sched1 -> rf -> pipe5 88 | 89 | # Source: ARM 90 | pipe6: Pipe \#6 { 91 | ALU 92 | MAC 93 | DIV 94 | } 95 | rob -> sched1 -> rf -> pipe6 96 | 97 | # Source: ARM 98 | sched2: FP/Vector Scheduler 99 | 100 | # Source: ARM 101 | pipe7: Pipe \#7 { 102 | FMUL 103 | FADD 104 | FDIV 105 | Vec ALU 106 | IMAC 107 | } 108 | rob -> sched2 -> rf -> pipe7 109 | 110 | # Source: ARM 111 | pipe8: Pipe \#8 { 112 | FMUL 113 | FADD 114 | Vec ALU 115 | } 116 | rob -> sched2 -> rf -> pipe8 117 | 118 | # Source: ARM 119 | pipe9: Pipe \#9 { 120 | FMUL 121 | FADD 122 | FDIV 123 | Vec ALU 124 | IMAC 125 | } 126 | rob -> sched2 -> rf -> pipe9 127 | 128 | # Source: ARM 129 | pipe10: Pipe \#10 { 130 | FMUL 131 | FADD 132 | Vec ALU 133 | } 134 | rob -> sched2 -> rf -> pipe10 135 | 136 | # Source: ARM 137 | sched3: Memory Scheduler 138 | 139 | # Source: ARM 140 | pipe11: Pipe \#11 { 141 | Store Data 142 | } 143 | rob -> sched3 -> rf -> pipe11 144 | 145 | # Source: ARM 146 | pipe12: Pipe \#12 { 147 | Store Data 148 | } 149 | rob -> sched3 -> rf -> pipe12 150 | 151 | # Source: ARM 152 | pipe13: Pipe \#13 { 153 | Load AGU 154 | Store AGU 155 | } 156 | rob -> sched3 -> rf -> pipe13 157 
| 158 | # Source: ARM 159 | pipe14: Pipe \#14 { 160 | Load AGU 161 | Store AGU 162 | } 163 | rob -> sched3 -> rf -> pipe14 164 | 165 | # Source: ARM 166 | pipe15: Pipe \#15 { 167 | Load AGU 168 | } 169 | rob -> sched3 -> rf -> pipe15 170 | 171 | lsu: LSU { 172 | Load Queue 173 | Store Queue 174 | } 175 | 176 | pipe11 -> lsu 177 | pipe12 -> lsu 178 | pipe13 -> lsu 179 | pipe14 -> lsu 180 | pipe15 -> lsu 181 | } 182 | frontend.rename -> backend.rob 183 | 184 | mem: Memory { 185 | l1: L1 DC { 186 | # Source: ARM 187 | l1dtlb: 48-entry fully associative L1 DTLB 188 | 189 | # Source: ARM 190 | l1dc: 64KB 4-way L1 DC 191 | 192 | # Source: ARM 193 | l2tlb: 2048-entry 8-way L2 TLB 194 | } 195 | 196 | l2: L2 { 197 | # Source: ARM 198 | l2dc: 512KB/1MB 8-way 4 bank L2 Cache 199 | } 200 | 201 | l1 -> l2 202 | 203 | l3: L3 { 204 | } 205 | l2 -> l3 206 | } 207 | frontend.l1ic -> mem.l2 208 | backend.lsu -> mem.l1 209 | 210 | info: |md 211 | Drawn by Jiajie Chen @jiegec 212 | 213 | Based on data from Chips and Cheese, ARM and Anandtech 214 | | 215 | } -------------------------------------------------------------------------------- /docs/cortex_x2.md: -------------------------------------------------------------------------------- 1 | # ARM Cortex X2 2 | 3 | ![](./cortex_x2.svg) 4 | 5 | References: 6 | 7 | - [Cortex X2: Arm Aims High](https://chipsandcheese.com/2023/10/27/cortex-x2-arm-aims-high/) 8 | - [Arm® Cortex®‑X2 Core Technical Reference Manual](https://developer.arm.com/documentation/101803/0200) 9 | - [Arm Announces Mobile Armv9 CPU Microarchitectures: Cortex-X2, Cortex-A710 & Cortex-A510](https://www.anandtech.com/show/16693/arm-announces-mobile-armv9-cpu-microarchitectures-cortexx2-cortexa710-cortexa510/2) 10 | -------------------------------------------------------------------------------- /docs/cortex_x3.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Cortex-X3 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor 
{ 4 | } 5 | 6 | l1ic: L1 IC { 7 | # Source: ARM 8 | l1itlb: 48-entry fully-associative L1 ITLB 9 | 10 | # Source: ARM 11 | l1ic: 64KB 4-way L1 IC 12 | } 13 | 14 | fq: Fetch Queue 15 | bp -> fq 16 | fq -> l1ic 17 | 18 | iq: Instruction Queue 19 | l1ic -> iq 20 | 21 | decode: 6-way Decode 22 | iq -> decode 23 | 24 | # Source: ARM 25 | mopc: 1536-entry 4-way skewed-associative MOP Cache 26 | decode -> mopc 27 | bp -> mopc 28 | 29 | mop: MOP Queue 30 | 31 | mopc -> mop: 8 MOP/cycle 32 | decode -> mop: 6 MOP/cycle 33 | 34 | # Source: ARM 35 | # "Cortex-X3 dispatch width: 6 instrs (I$), 8 insts (Mop$)" 36 | # "The dispatch stage can process up to 8 MOPs per cycle and dispatch up to 37 | # 16 µOPs per cycle, with the following limitations ..." 38 | rename: 8-way Rename { 39 | Zero Idiom 40 | } 41 | mop -> rename 42 | } 43 | 44 | backend: Backend { 45 | # Source: ARM 46 | rob: 320-MOP-entry ROB 47 | 48 | rf: Register File { 49 | irf: Integer Register File 50 | 51 | flagsrf: Flags Register File 52 | 53 | vrf: FP/Vector Register File 54 | } 55 | 56 | # Source: ARM 57 | sched1: ALU Scheduler 58 | 59 | # Source: ARM 60 | pipe1: Pipe \#1 { 61 | Branch 62 | } 63 | rob -> sched1 -> rf -> pipe1 64 | 65 | # Source: ARM 66 | pipe2: Pipe \#2 { 67 | Branch 68 | } 69 | rob -> sched1 -> rf -> pipe2 70 | 71 | # Source: ARM 72 | pipe3: Pipe \#3 { 73 | ALU 74 | } 75 | rob -> sched1 -> rf -> pipe3 76 | 77 | # Source: ARM 78 | pipe4: Pipe \#4 { 79 | ALU 80 | } 81 | rob -> sched1 -> rf -> pipe4 82 | 83 | # Source: ARM 84 | pipe5: Pipe \#5 { 85 | ALU 86 | } 87 | rob -> sched1 -> rf -> pipe5 88 | 89 | # Source: ARM 90 | pipe6: Pipe \#6 { 91 | ALU 92 | } 93 | rob -> sched1 -> rf -> pipe6 94 | 95 | # Source: ARM 96 | pipe7: Pipe \#7 { 97 | ALU 98 | MUL 99 | } 100 | rob -> sched1 -> rf -> pipe7 101 | 102 | # Source: ARM 103 | pipe8: Pipe \#8 { 104 | ALU 105 | MAC 106 | DIV 107 | } 108 | rob -> sched1 -> rf -> pipe8 109 | 110 | # Source: ARM 111 | sched2: FP/Vector Scheduler 112 | 113 | # Source: 
ARM 114 | pipe9: Pipe \#9 { 115 | FMUL 116 | FADD 117 | FDIV 118 | Vec ALU 119 | IMAC 120 | } 121 | rob -> sched2 -> rf -> pipe9 122 | 123 | # Source: ARM 124 | pipe10: Pipe \#10 { 125 | FMUL 126 | FADD 127 | Vec ALU 128 | } 129 | rob -> sched2 -> rf -> pipe10 130 | 131 | # Source: ARM 132 | pipe11: Pipe \#11 { 133 | FMUL 134 | FADD 135 | FDIV 136 | Vec ALU 137 | IMAC 138 | } 139 | rob -> sched2 -> rf -> pipe11 140 | 141 | # Source: ARM 142 | pipe12: Pipe \#12 { 143 | FMUL 144 | FADD 145 | Vec ALU 146 | } 147 | rob -> sched2 -> rf -> pipe12 148 | 149 | # Source: ARM 150 | sched3: Memory Scheduler 151 | 152 | # Source: ARM 153 | pipe13: Pipe \#13 { 154 | Store Data 155 | } 156 | rob -> sched3 -> rf -> pipe13 157 | 158 | # Source: ARM 159 | pipe14: Pipe \#14 { 160 | Store Data 161 | } 162 | rob -> sched3 -> rf -> pipe14 163 | 164 | # Source: ARM 165 | pipe15: Pipe \#15 { 166 | Load AGU 167 | Store AGU 168 | } 169 | rob -> sched3 -> rf -> pipe15 170 | 171 | # Source: ARM 172 | pipe16: Pipe \#16 { 173 | Load AGU 174 | Store AGU 175 | } 176 | rob -> sched3 -> rf -> pipe16 177 | 178 | # Source: ARM 179 | pipe17: Pipe \#17 { 180 | Load AGU 181 | } 182 | rob -> sched3 -> rf -> pipe17 183 | 184 | lsu: LSU { 185 | Load Queue 186 | Store Queue 187 | } 188 | 189 | pipe13 -> lsu 190 | pipe14 -> lsu 191 | pipe15 -> lsu 192 | pipe16 -> lsu 193 | pipe17 -> lsu 194 | } 195 | frontend.rename -> backend.rob 196 | 197 | mem: Memory { 198 | l1: L1 DC { 199 | } 200 | 201 | l2: L2 { 202 | } 203 | 204 | l1 -> l2 205 | 206 | l3: L3 { 207 | } 208 | l2 -> l3 209 | } 210 | frontend.l1ic -> mem.l2 211 | backend.lsu -> mem.l1 212 | 213 | info: |md 214 | Drawn by Jiajie Chen @jiegec 215 | 216 | Based on data from ARM and Wikichip 217 | | 218 | } -------------------------------------------------------------------------------- /docs/cortex_x3.md: -------------------------------------------------------------------------------- 1 | # ARM Cortex X3 2 | 3 | ![](./cortex_x3.svg) 4 | 5 | References: 6 | 
7 | - [Arm® Cortex‑X3 Core Technical Reference Manual](https://developer.arm.com/documentation/101593/latest/) 8 | - [Arm Unveils Next-Gen Flagship Core: Cortex-X3](https://fuse.wikichip.org/news/6855/arm-unveils-next-gen-flagship-core-cortex-x3/) 9 | -------------------------------------------------------------------------------- /docs/cortex_x4.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Cortex-X4 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | } 5 | 6 | l1ic: L1 IC { 7 | # Source: ARM 8 | l1itlb: 48-entry fully-associative L1 ITLB 9 | 10 | # Source: ARM 11 | l1ic: 64KB 4-way L1 IC 12 | } 13 | 14 | fq: Fetch Queue 15 | bp -> fq 16 | fq -> l1ic 17 | 18 | iq: Instruction Queue 19 | l1ic -> iq 20 | 21 | # Source: ARM 22 | decode: 10-way Decode 23 | iq -> decode 24 | 25 | # Source: ARM 26 | # "Cortex-X4 dispatch width: 10 instrs" 27 | # "The dispatch stage can process up to 10 MOPs per cycle and dispatch up to 28 | # 20 μOPs per cycle, with the following limitations ..." 
29 | rename: 10-way Rename { 30 | Move Elimination 31 | Zero Idiom 32 | } 33 | decode -> rename 34 | } 35 | 36 | backend: Backend { 37 | # Source: ARM 38 | # "MCQ capacity: 320x2 -> 384x2" 39 | rob: 384-MOP-entry ROB 40 | 41 | rf: Register File { 42 | irf: Integer Register File 43 | 44 | flagsrf: Flags Register File 45 | 46 | vrf: FP/Vector Register File 47 | } 48 | 49 | # Source: ARM 50 | sched1: ALU Scheduler 51 | 52 | # Source: ARM 53 | pipe1: Pipe \#1 { 54 | Branch 55 | } 56 | rob -> sched1 -> rf -> pipe1 57 | 58 | # Source: ARM 59 | pipe2: Pipe \#2 { 60 | Branch 61 | } 62 | rob -> sched1 -> rf -> pipe2 63 | 64 | # Source: ARM 65 | pipe3: Pipe \#3 { 66 | Branch 67 | } 68 | rob -> sched1 -> rf -> pipe3 69 | 70 | # Source: ARM 71 | pipe4: Pipe \#4 { 72 | ALU 73 | } 74 | rob -> sched1 -> rf -> pipe4 75 | 76 | # Source: ARM 77 | pipe5: Pipe \#5 { 78 | ALU 79 | } 80 | rob -> sched1 -> rf -> pipe5 81 | 82 | # Source: ARM 83 | pipe6: Pipe \#6 { 84 | ALU 85 | } 86 | rob -> sched1 -> rf -> pipe6 87 | 88 | # Source: ARM 89 | pipe7: Pipe \#7 { 90 | ALU 91 | } 92 | rob -> sched1 -> rf -> pipe7 93 | 94 | # Source: ARM 95 | pipe8: Pipe \#8 { 96 | ALU 97 | } 98 | rob -> sched1 -> rf -> pipe8 99 | 100 | # Source: ARM 101 | pipe9: Pipe \#9 { 102 | ALU 103 | } 104 | rob -> sched1 -> rf -> pipe9 105 | 106 | # Source: ARM 107 | pipe10: Pipe \#10 { 108 | ALU 109 | MAC 110 | } 111 | rob -> sched1 -> rf -> pipe10 112 | 113 | # Source: ARM 114 | pipe11: Pipe \#11 { 115 | ALU 116 | MAC 117 | DIV 118 | } 119 | rob -> sched1 -> rf -> pipe11 120 | 121 | # Source: ARM 122 | sched2: FP/Vector Scheduler 123 | 124 | # Source: ARM 125 | pipe12: Pipe \#12 { 126 | FMUL 127 | FADD 128 | FDIV 129 | Vec ALU 130 | IMAC 131 | } 132 | rob -> sched2 -> rf -> pipe12 133 | 134 | # Source: ARM 135 | pipe13: Pipe \#13 { 136 | FMUL 137 | FADD 138 | Vec ALU 139 | } 140 | rob -> sched2 -> rf -> pipe13 141 | 142 | # Source: ARM 143 | pipe14: Pipe \#14 { 144 | FMUL 145 | FADD 146 | FDIV 147 | Vec ALU 148 | IMAC 
149 | } 150 | rob -> sched2 -> rf -> pipe14 151 | 152 | # Source: ARM 153 | pipe15: Pipe \#15 { 154 | FMUL 155 | FADD 156 | Vec ALU 157 | } 158 | rob -> sched2 -> rf -> pipe15 159 | 160 | # Source: ARM 161 | sched3: Memory Scheduler 162 | 163 | # Source: ARM 164 | pipe16: Pipe \#16 { 165 | Store Data 166 | } 167 | rob -> sched3 -> rf -> pipe16 168 | 169 | # Source: ARM 170 | pipe17: Pipe \#17 { 171 | Store Data 172 | } 173 | rob -> sched3 -> rf -> pipe17 174 | 175 | # Source: ARM 176 | pipe18: Pipe \#18 { 177 | Load AGU 178 | Store AGU 179 | } 180 | rob -> sched3 -> rf -> pipe18 181 | 182 | # Source: ARM 183 | pipe19: Pipe \#19 { 184 | Load AGU 185 | } 186 | rob -> sched3 -> rf -> pipe19 187 | 188 | # Source: ARM 189 | pipe20: Pipe \#20 { 190 | Load AGU 191 | } 192 | rob -> sched3 -> rf -> pipe20 193 | 194 | # Source: ARM 195 | pipe21: Pipe \#21 { 196 | Store AGU 197 | } 198 | rob -> sched3 -> rf -> pipe21 199 | 200 | lsu: LSU { 201 | Load Queue 202 | Store Queue 203 | } 204 | 205 | pipe16 -> lsu 206 | pipe17 -> lsu 207 | pipe18 -> lsu 208 | pipe19 -> lsu 209 | pipe20 -> lsu 210 | pipe21 -> lsu 211 | } 212 | frontend.rename -> backend.rob 213 | 214 | mem: Memory { 215 | l1: L1 DC { 216 | # Source: ARM 217 | l1dtlb: 96-entry fully-associative L1 DTLB 218 | 219 | # Source: ARM 220 | l1dc: 64KB 4-way L1 DC 221 | 222 | # Source: ARM 223 | l2tlb: 2048-entry 8-way L2 TLB 224 | } 225 | 226 | l2: L2 { 227 | # Source: ARM 228 | 512KB/1MB/2MB 8-way 4-bank L2 Cache 229 | } 230 | 231 | l1 -> l2 232 | 233 | l3: L3 { 234 | 32MB L3 Cache 235 | } 236 | l2 -> l3 237 | } 238 | frontend.l1ic -> mem.l2 239 | backend.lsu -> mem.l1 240 | 241 | info: |md 242 | Drawn by Jiajie Chen @jiegec 243 | 244 | Based on data from ARM and Anandtech 245 | | 246 | } -------------------------------------------------------------------------------- /docs/cortex_x4.md: -------------------------------------------------------------------------------- 1 | # ARM Cortex X4 2 | 3 | ![](./cortex_x4.svg) 4 | 5 | 
References: 6 | 7 | - [Arm Unveils 2023 Mobile CPU Core Designs: Cortex-X4, A720, and A520 - the Armv9.2 Family](https://www.anandtech.com/show/18871/arm-unveils-armv92-mobile-architecture-cortex-x4-a720-and-a520-64bit-exclusive/2) 8 | - [Arm® Cortex-X4 Core Technical Reference Manual](https://developer.arm.com/documentation/102484/latest/) 9 | -------------------------------------------------------------------------------- /docs/cortex_x925.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Cortex-X925 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | } 5 | 6 | l1ic: L1 IC { 7 | # Source: ARM 8 | l1itlb: 128-entry fully-associative L1 ITLB 9 | 10 | # Source: ARM 11 | l1ic: 64KB 4-way L1 IC 12 | } 13 | 14 | fq: Fetch Queue 15 | bp -> fq 16 | fq -> l1ic 17 | 18 | iq: Instruction Queue 19 | l1ic -> iq 20 | 21 | # Source: ARM 22 | decode: 10-way Decode 23 | iq -> decode 24 | 25 | # Source: ARM 26 | # "The dispatch stage can process up to 10 MOPs per cycle and dispatch up to 27 | # 20 µOPs per cycle, with the following limitations ..." 
28 | rename: 10-way Rename { 29 | Sign Extension Elimination 30 | Move Elimination 31 | Zero Idiom 32 | } 33 | decode -> rename 34 | } 35 | 36 | backend: Backend { 37 | rob: ROB 38 | 39 | rf: Register File { 40 | irf: Integer Register File 41 | 42 | flagsrf: Flags Register File 43 | 44 | vrf: FP/Vector Register File 45 | } 46 | 47 | # Source: ARM 48 | sched1: ALU Scheduler 49 | 50 | # Source: ARM 51 | pipe1: Pipe \#1 { 52 | Branch 53 | } 54 | rob -> sched1 -> rf -> pipe1 55 | 56 | # Source: ARM 57 | pipe2: Pipe \#2 { 58 | Branch 59 | } 60 | rob -> sched1 -> rf -> pipe2 61 | 62 | # Source: ARM 63 | pipe3: Pipe \#3 { 64 | Branch 65 | } 66 | rob -> sched1 -> rf -> pipe3 67 | 68 | # Source: ARM 69 | pipe4: Pipe \#4 { 70 | ALU 71 | } 72 | rob -> sched1 -> rf -> pipe4 73 | 74 | # Source: ARM 75 | pipe5: Pipe \#5 { 76 | ALU 77 | MUL 78 | } 79 | rob -> sched1 -> rf -> pipe5 80 | 81 | # Source: ARM 82 | pipe6: Pipe \#6 { 83 | ALU 84 | } 85 | rob -> sched1 -> rf -> pipe6 86 | 87 | # Source: ARM 88 | pipe7: Pipe \#7 { 89 | ALU 90 | MUL 91 | } 92 | rob -> sched1 -> rf -> pipe7 93 | 94 | # Source: ARM 95 | pipe8: Pipe \#8 { 96 | ALU 97 | } 98 | rob -> sched1 -> rf -> pipe8 99 | 100 | # Source: ARM 101 | pipe9: Pipe \#9 { 102 | ALU 103 | MUL 104 | } 105 | rob -> sched1 -> rf -> pipe9 106 | 107 | # Source: ARM 108 | pipe10: Pipe \#10 { 109 | ALU 110 | DIV 111 | CRC 112 | } 113 | rob -> sched1 -> rf -> pipe10 114 | 115 | # Source: ARM 116 | pipe11: Pipe \#11 { 117 | ALU 118 | MUL 119 | CRC 120 | } 121 | rob -> sched1 -> rf -> pipe11 122 | 123 | # Source: ARM 124 | sched2: FP/Vector Scheduler 125 | 126 | # Source: ARM 127 | pipe12: Pipe \#12 { 128 | Vec ALU 129 | Vec INT MUL 130 | FADD 131 | FMUL 132 | Crypto 133 | } 134 | rob -> sched2 -> rf -> pipe12 135 | 136 | # Source: ARM 137 | pipe13: Pipe \#13 { 138 | Vec ALU 139 | FMUL 140 | FADD 141 | FDIV 142 | FSQRT 143 | Crypto 144 | } 145 | rob -> sched2 -> rf -> pipe13 146 | 147 | # Source: ARM 148 | pipe14: Pipe \#14 { 149 | Vec ALU 
150 | FMUL 151 | FADD 152 | } 153 | rob -> sched2 -> rf -> pipe14 154 | 155 | # Source: ARM 156 | pipe15: Pipe \#15 { 157 | Vec ALU 158 | Vec INT MUL 159 | FMUL 160 | FADD 161 | Crypto 162 | } 163 | rob -> sched2 -> rf -> pipe15 164 | 165 | # Source: ARM 166 | pipe16: Pipe \#16 { 167 | Vec ALU 168 | FMUL 169 | FADD 170 | Crypto 171 | } 172 | rob -> sched2 -> rf -> pipe16 173 | 174 | # Source: ARM 175 | pipe17: Pipe \#17 { 176 | Vec ALU 177 | FMUL 178 | FADD 179 | } 180 | rob -> sched2 -> rf -> pipe17 181 | 182 | # Source: ARM 183 | sched3: Memory Scheduler 184 | 185 | # Source: ARM 186 | pipe18: Pipe \#18 { 187 | Store Data 188 | } 189 | rob -> sched3 -> rf -> pipe18 190 | 191 | # Source: ARM 192 | pipe19: Pipe \#19 { 193 | Store Data 194 | } 195 | rob -> sched3 -> rf -> pipe19 196 | 197 | # Source: ARM 198 | pipe20: Pipe \#20 { 199 | Load AGU 200 | Store AGU 201 | } 202 | rob -> sched3 -> rf -> pipe20 203 | 204 | # Source: ARM 205 | pipe21: Pipe \#21 { 206 | Load AGU 207 | Store AGU 208 | } 209 | rob -> sched3 -> rf -> pipe21 210 | 211 | # Source: ARM 212 | pipe22: Pipe \#22 { 213 | Load AGU 214 | } 215 | rob -> sched3 -> rf -> pipe22 216 | 217 | # Source: ARM 218 | pipe23: Pipe \#23 { 219 | Load AGU 220 | } 221 | rob -> sched3 -> rf -> pipe23 222 | 223 | lsu: LSU { 224 | Load Queue 225 | Store Queue 226 | } 227 | 228 | pipe18 -> lsu 229 | pipe19 -> lsu 230 | pipe20 -> lsu 231 | pipe21 -> lsu 232 | pipe22 -> lsu 233 | pipe23 -> lsu 234 | } 235 | frontend.rename -> backend.rob 236 | 237 | mem: Memory { 238 | l1: L1 DC { 239 | # Source: ARM 240 | l1dtlb: 96-entry fully-associative L1 DTLB 241 | 242 | # Source: ARM 243 | l1dc: 64KB 4-way L1 DC 244 | 245 | # Source: ARM 246 | l2tlb: 2048-entry 8-way L2 TLB 247 | } 248 | 249 | l2: L2 { 250 | # Source: ARM 251 | 2MB 8-way or 3MB 12-way, 4-bank L2 Cache 252 | } 253 | 254 | l1 -> l2 255 | 256 | l3: L3 { 257 | } 258 | l2 -> l3 259 | } 260 | frontend.l1ic -> mem.l2 261 | backend.lsu -> mem.l1 262 | 263 | info: |md 264 | 
Drawn by Jiajie Chen @jiegec 265 | 266 | Based on data from ARM and Anandtech 267 | | 268 | } -------------------------------------------------------------------------------- /docs/cortex_x925.md: -------------------------------------------------------------------------------- 1 | # ARM Cortex X925 2 | 3 | ![](./cortex_x925.svg) 4 | 5 | References: 6 | 7 | - [Arm Unveils 2024 CPU Core Designs, Cortex X925, A725 and A520: Arm v9.2 Redefined For 3nm](https://www.anandtech.com/show/21399/arm-unveils-2024-cpu-core-designs-cortex-x925-a725-and-a520-arm-v9-2-redefined-for-3nm-/2) 8 | - [Arm® Cortex-X925 Core Technical Reference Manual](https://developer.arm.com/documentation/102807/0001) 9 | - [Arm® Cortex-X925 Core Software Optimization Guide](https://developer.arm.com/documentation/109842/latest/) 10 | - [Cortex X925 微架构评测:路在何方](https://zhuanlan.zhihu.com/p/945571328) 11 | -------------------------------------------------------------------------------- /docs/crestmont.d2: -------------------------------------------------------------------------------- 1 | cpu: Intel Meteor Lake E-core Crestmont CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | l1btb: 1024-entry L1 BTB 6 | 7 | # Source: Chips and Cheese, Intel 8 | # Intel: "Larger Branch Target Buffer (6K entry from 5K) with Enhanced 9 | # Path Based Branch Prediction." 10 | l2btb: 6144-entry L2 BTB 11 | 12 | # Source: Chips and Cheese 13 | ind: >=512-entry Indirect Target Array 14 | 15 | # Source: Intel 16 | # "Increased Branch Prediction Bandwidth (128B/cycle max from 32B/cycle on 17 | # Gracemont)."
18 | bw: 128B/cycle prediction bandwidth 19 | } 20 | 21 | l1ic: L1 IC { 22 | # Source: Chips and Cheese, Intel 23 | # Intel: 24 | # Level Entries Associativity Architectural Page Size Cached Translation Size 25 | # ITLB 64 Fully associative All 4KB, 256KB 26 | itlb: 64-entry fully associative ITLB 27 | 28 | # Source: Chips and Cheese 29 | l1ic: 64KB 8-way L1 IC 30 | } 31 | 32 | # Source: Chips and Cheese 33 | fq: 2x Fetch Queue 34 | bp -> fq 35 | fq -> l1ic 36 | 37 | iq1: Instruction Queue \#1 38 | l1ic -> iq1 39 | 40 | # Source: Chips and Cheese 41 | decode1: 3-way Decode \#1 42 | iq1 -> decode1 43 | 44 | # Source: Chips and Cheese 45 | uop1: UOP Queue \#1 46 | decode1 -> uop1 47 | 48 | iq2: Instruction Queue \#2 49 | l1ic -> iq2 50 | 51 | # Source: Chips and Cheese 52 | decode2: 3-way Decode \#2 53 | iq2 -> decode2 54 | 55 | # Source: Chips and Cheese 56 | uop2: UOP Queue \#2 57 | decode2 -> uop2 58 | 59 | # Source: Chips and Cheese, Intel 60 | # Intel: "Wider allocation width (6-wide from 5-wide)" 61 | rename: 6-way Rename { 62 | Move Elimination 63 | Zero Idiom 64 | } 65 | uop1 -> rename 66 | uop2 -> rename 67 | } 68 | 69 | backend: Backend { 70 | # Source: Chips and Cheese, Intel 71 | # Intel: "For the 256-entry retirement buffer, retirement can be up to eight 72 | # instructions per cycle." 
73 | rob: 256-entry ROB, 8-wide retirement 74 | 75 | # Source: Chips and Cheese 76 | bob: 116-taken-entry 126-not-taken-entry Branch Order Buffer 77 | 78 | rf: Register File { 79 | # Source: Chips and Cheese 80 | irf: 214-entry Integer Register File 81 | 82 | # Source: Chips and Cheese 83 | vrf: 207-entry 128b Vector Register File 84 | } 85 | 86 | # Source: Chips and Cheese 87 | sched1: 16-entry ALU Scheduler \#1 88 | 89 | # Source: Chips and Cheese 90 | pipe1: Port 0 { 91 | grid-columns: 1 92 | ALU 93 | } 94 | rob -> sched1 -> rf.irf -> pipe1 95 | 96 | # Source: Chips and Cheese 97 | sched2: 16-entry ALU Scheduler \#2 98 | 99 | # Source: Chips and Cheese 100 | pipe2: Port 1 { 101 | grid-columns: 1 102 | ALU 103 | INT MUL 104 | INT DIV 105 | LEA 106 | } 107 | rob -> sched2 -> rf.irf -> pipe2 108 | 109 | # Source: Chips and Cheese 110 | sched3: 16-entry ALU Scheduler \#3 111 | 112 | # Source: Chips and Cheese 113 | pipe3: Port 2 { 114 | grid-columns: 1 115 | ALU 116 | INT MUL 117 | INT DIV 118 | PDEP 119 | } 120 | rob -> sched3 -> rf.irf -> pipe3 121 | 122 | # Source: Chips and Cheese 123 | sched4: 16-entry ALU Scheduler \#4 124 | 125 | # Source: Chips and Cheese 126 | pipe4: Port 3 { 127 | grid-columns: 1 128 | ALU 129 | } 130 | rob -> sched4 -> rf.irf -> pipe4 131 | 132 | # Source: Chips and Cheese 133 | sched5: 42-entry Branch/Store Data Scheduler \#5 134 | 135 | # Source: Chips and Cheese 136 | pipe5: Port 30 { 137 | JMP/Branch 138 | } 139 | rob -> sched5 -> rf.irf -> pipe5 140 | 141 | # Source: Chips and Cheese 142 | pipe6: Port 31 { 143 | JMP/Branch 144 | } 145 | rob -> sched5 -> rf.irf -> pipe6 146 | 147 | # Source: Chips and Cheese (store data shares the branch scheduler) 148 | pipe7: Port 8 { 149 | Store Data 150 | } 151 | rob -> sched5 -> rf.irf -> pipe7 152 | 153 | # Source: Chips and Cheese (store data shares the branch scheduler) 154 | pipe8: Port 9 { 155 | Store Data 156 | } 157 | rob -> sched5 -> rf.irf -> pipe8 158 | 159 | # Source: Chips and Cheese 160 | nsq1: 22-entry AGU Non-Scheduling Queue \#1 161 | 162 | # Source: Chips and
Cheese 163 | sched6: 22-entry AGU Scheduler \#6 164 | 165 | # Source: Chips and Cheese 166 | pipe9: Port 10 { 167 | Load AGU 168 | } 169 | rob -> nsq1 -> sched6 -> rf.irf -> pipe9 170 | 171 | # Source: Chips and Cheese 172 | pipe10: Port 11 { 173 | Load AGU 174 | } 175 | rob -> nsq1 -> sched6 -> rf.irf -> pipe10 176 | 177 | # Source: Chips and Cheese 178 | pipe11: Port 12 { 179 | Store AGU 180 | } 181 | rob -> nsq1 -> sched6 -> rf.irf -> pipe11 182 | 183 | # Source: Chips and Cheese 184 | pipe12: Port 13 { 185 | Store AGU 186 | } 187 | rob -> nsq1 -> sched6 -> rf.irf -> pipe12 188 | 189 | lsu: LSU { 190 | # Source: Chips and Cheese 191 | 80-entry Load Queue 192 | 48-entry Store Queue 193 | 194 | # Source: Intel 195 | # "The Crestmont microarchitecture’s memory subsystem is designed to handle 196 | # two 16 byte loads and two 16 byte stores per cycle, providing 197 | # simultaneous 32 bytes of read bandwidth and 32 bytes of write bandwidth 198 | # per cycle. The load-to-use latency for loads is typically four cycles. 199 | # Suppose you are doing a pointer-chasing operation where the computed 200 | # address results from a single prior load and a positive displacement of 201 | # no more than +1023. In that case, the load-to-use latency observed can 202 | # be reduced to three cycles."
203 | 2x128b load and 2x128b store per cycle 204 | 4 cycle load to use latency 205 | 3 cycle load to use latency in pointer chasing 206 | } 207 | 208 | pipe7 -> lsu 209 | pipe8 -> lsu 210 | pipe9 -> lsu 211 | pipe10 -> lsu 212 | pipe11 -> lsu 213 | pipe12 -> lsu 214 | 215 | # Source: Chips and Cheese 216 | nsq2: 57-entry FP Non-Scheduling Queue \#2 217 | 218 | # Source: Chips and Cheese 219 | sched7: 22-entry FStore Scheduler \#7 220 | 221 | # Source: Chips and Cheese 222 | pipe13: Port 28 { 223 | FP Store Data 224 | } 225 | rob -> nsq2 -> sched7 -> rf.vrf -> pipe13 226 | 227 | # Source: Chips and Cheese 228 | pipe14: Port 29 { 229 | FP Store Data 230 | } 231 | rob -> nsq2 -> sched7 -> rf.vrf -> pipe14 232 | 233 | # Source: Chips and Cheese 234 | sched8: 38-entry FP/Vector Scheduler \#8 235 | 236 | # Source: Chips and Cheese 237 | pipe15: Port 20 { 238 | grid-columns: 1 239 | INT Vec ALU 240 | INT Vec MUL 241 | FMA 242 | FADD 243 | FMUL 244 | AES 245 | SHA 246 | } 247 | rob -> nsq2 -> sched8 -> rf.vrf -> pipe15 248 | 249 | # Source: Chips and Cheese 250 | pipe16: Port 21 { 251 | grid-columns: 1 252 | INT Vec ALU 253 | INT Vec MUL 254 | FMA 255 | FADD 256 | FMUL 257 | AES 258 | } 259 | rob -> nsq2 -> sched8 -> rf.vrf -> pipe16 260 | 261 | # Source: Chips and Cheese 262 | pipe17: Port 22 { 263 | INT Vec ALU 264 | } 265 | rob -> nsq2 -> sched8 -> rf.vrf -> pipe17 266 | } 267 | 268 | frontend.rename -> backend.rob 269 | frontend.rename -> backend.bob 270 | 271 | mem: Memory { 272 | l1: L1 DC { 273 | # Source: Chips and Cheese, Intel 274 | # Level Entries Associativity Architectural Page Size Cached Translation Size 275 | # DTLB 48 Fully associative All 4KB, 2MB 276 | l1dtlb: 48-entry fully associative L1 DTLB 277 | 278 | # Source: Chips and Cheese 279 | l1dc: 32KB 8-way L1DC 280 | } 281 | 282 | l2: L2 { 283 | # Source: Chips and Cheese, Intel 284 | # Level Entries Associativity Architectural Page Size Cached Translation Size 285 | # STLB 3072 6-way 4K/2M/4M 4KB, 2MB 
286 | # STLB 16 Fully associative 1GB 1GB 287 | l2tlb: 3072-entry 6-way 4K/2M/4M page, 16-entry fully associative 1G page L2 TLB 288 | 289 | # Source: Chips and Cheese, Intel 290 | # "The L2 cache delivers 64 bytes of data per cycle at a latency of 17 291 | # cycles, and that bandwidth is shared amongst 4 cores." 292 | l2dc: 2MB 16-way Shared L2 Cache, 17 cycle latency 293 | } 294 | 295 | # Source: Chips and Cheese 296 | l1 -> l2: 64B/cycle shared among 4 cores 297 | 298 | l3: L3 { 299 | # Source: Chips and Cheese 300 | l3dc: 24MB 12-way L3 Cache 301 | } 302 | l2 -> l3 303 | } 304 | frontend.l1ic -> mem.l2 305 | backend.lsu -> mem.l1 306 | 307 | info: |md 308 | Drawn by Jiajie Chen @jiegec 309 | 310 | Based on data from Chips and Cheese 311 | | 312 | } 313 | -------------------------------------------------------------------------------- /docs/crestmont.md: -------------------------------------------------------------------------------- 1 | # Intel Meteor Lake E-core aka Crestmont 2 | 3 | ![](./crestmont.svg) 4 | 5 | References: 6 | 7 | - [Meteor Lake’s E-Cores: Crestmont Makes Incremental Progress](https://chipsandcheese.com/2024/05/13/meteor-lakes-e-cores-crestmont-makes-incremental-progress/) 8 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1 9 | - [The Next Generation of High Performance, Energy-Efficient Computing: Intel® Xeon® Processors Built on Efficient-Core](https://hc2023.hotchips.org/assets/program/conference/day1/CPU2/HC2023.Intel.Soltis.FINAL.pdf) 10 | -------------------------------------------------------------------------------- /docs/dieshot.md: -------------------------------------------------------------------------------- 1 | # Dieshot 2 | 3 | - AMD Ryzen 9 9900x: https://www.bilibili.com/opus/965843745820901377 4 | - AMD Strix Point: https://www.bilibili.com/opus/959217298443337751 5 | - Apple A17: https://www.youtube.com/watch?v=veikj5uvAc8 6 | - Google Tensor Gen1:
https://twitter.com/Kurnalsalts/status/1792171201076551747 https://www.bilibili.com/opus/942059110279413798 7 | - Google Tensor Gen2: https://twitter.com/Kurnalsalts/status/1792171201076551747 https://www.bilibili.com/opus/942059110279413798 8 | - Google Tensor Gen3: https://twitter.com/Kurnalsalts/status/1799097967884083393 https://www.bilibili.com/opus/942059110279413798 9 | - Intel Meteor Lake: https://www.bilibili.com/opus/967751673346785305 10 | - MediaTek Dimensity 9200: 1x Cortex-X3, 3x Cortex-A715, 4x Cortex-A510, Immortalis-G715 https://i.mediatek.com/dimensity-9200 https://kurnal.xlog.app/D9200 11 | - MediaTek Dimensity 9300: 4x Cortex-X4, 4x Cortex-A720, Immortalis-G720 https://i.mediatek.com/dimensity-9300 https://twitter.com/Kurnalsalts/status/1767976930446553369 12 | - MediaTek Dimensity 930: 2x Cortex-A78, 6x Cortex-A55 https://i.mediatek.com/dimensity-930 https://twitter.com/Kurnalsalts/status/1727746638574420285 13 | - MediaTek Helio X30: 2x Cortex-A73, 4x Cortex-A53, 4x Cortex-A35 https://www.mediatek.com/products/smartphones-2/mediatek-helio-x30 https://twitter.com/Kurnalsalts/status/1790624036613218466 14 | - Nvidia AD102: https://twitter.com/Kurnalsalts/status/1784611359608680563 15 | - Qualcomm 7 Gen1: 4x(1+3) Cortex-A710, 4x Cortex-A510 https://twitter.com/Kurnalsalts/status/1776276658426966068 16 | - Qualcomm 8 Gen2: 1x Cortex-X3, 2x Cortex-A715, 2x Cortex-A710, 3x Cortex-A510 https://twitter.com/Kurnalsalts/status/1705935348893905147 https://kurnal.xlog.app/SM8550 17 | - Qualcomm 8 Gen3: 1x Cortex-X4, 5x(2+3) Cortex-A720, 2x Cortex-A520 https://twitter.com/Kurnalsalts/status/1776276658426966068 https://twitter.com/Kurnalsalts/status/1776201339615514721 18 | - Qualcomm 845: https://kurnal.xlog.app/SDM845 19 | - Qualcomm 888: https://twitter.com/Kurnalsalts/status/1727744875305832710 20 | - Qualcomm 8s Gen3: https://t.bilibili.com/916817677665697796 https://twitter.com/Kurnalsalts/status/1776276658426966068 
https://twitter.com/Kurnalsalts/status/1776201339615514721 21 | - Qualcomm X Elite: https://tieba.baidu.com/p/9194576062 https://chipwise.tech/our-portfolio/snapdragon-x-elite/ 22 | - Qualcomm 8 Elite: https://x.com/Kurnalsalts/status/1848700612181168601 23 | - Samsung Exynos 2100: https://twitter.com/Kurnalsalts/status/1784620815474135151 24 | - Samsung Exynos 2200: https://twitter.com/Kurnalsalts/status/1785040012188471347 https://www.bilibili.com/opus/942039658357850146 25 | - Samsung Exynos 2300: https://www.bilibili.com/opus/942039658357850146 26 | - Samsung Exynos 2400: https://twitter.com/Kurnalsalts/status/1785252470408773986 https://www.bilibili.com/opus/942039658357850146 27 | -------------------------------------------------------------------------------- /docs/firestorm.d2: -------------------------------------------------------------------------------- 1 | cpu : Apple M1 Firestorm CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: JamesAslan, Chips and Cheese 5 | # Source: jiegec, see /data/firestorm/btb.csv 6 | l1btb: 1024-entry L1 BTB, 1 cycle latency 7 | 8 | # Source: JamesAslan 9 | # Source: jiegec, see hw.perflevel0.l1icachesize in /data/firestorm/cache.txt 10 | l2btb: 192KB L1 IC as L2 BTB, 3 cycle latency 11 | 12 | # Source: jiegec, see /data/firestorm/ras.csv 13 | ras: 50-entry RAS 14 | } 15 | 16 | # Coupled Frontend 17 | l1ic: L1 IC { 18 | # Source: Dougall Johnson 19 | # Source: jiegec, see hw.perflevel0.l1icachesize in /data/firestorm/cache.txt 20 | l1ic: 192KB L1 IC 21 | } 22 | 23 | bp -> l1ic 24 | 25 | # Source: Dougall Johnson 26 | decode: 8-way Decode 27 | # Source: jiegec 28 | l1ic -> decode: 16 inst/cycle 29 | decode -> bp 30 | 31 | # Source: Dougall Johnson 32 | rename: 8-way Rename 33 | decode -> rename 34 | } 35 | 36 | backend: Backend { 37 | # Source: Dougall Johnson 38 | # Source: jiegec, ~325 39 | rob: ~330-entry Coalesced ROB 40 | 41 | # Source: Dougall Johnson 42 | # Source: jiegec, ~620 43 | rename:
~623-entry Rename History 44 | 45 | rf: Register File { 46 | # Source: Dougall Johnson 47 | irf: ~380-entry Integer Register File 48 | 49 | # Source: Dougall Johnson 50 | vrf: ~432-entry 128b Vector Register File 51 | 52 | # Source: Dougall Johnson 53 | flagsrf: ~128-entry Flags Register File 54 | } 55 | 56 | # Source: Dougall Johnson 57 | # Source: jiegec, 158(=12+12+24+26+16+12+28+28) sched size for alu 58 | # Source: jiegec, 133(~=134=24+26+16+12+28+28) sched w/o nsq size for alu 59 | # Source: jiegec, 68(12+28+28) sched size for imul 60 | # Source: jiegec, 56(28+28) sched w/o nsq size for imul 61 | dispatch1: 12-entry ALU Dispatch Queue \#1 62 | 63 | # Source: Dougall Johnson 64 | sched1: 24-entry Scheduler \#1 65 | 66 | # Source: Dougall Johnson 67 | pipe1: Pipe \#1 { 68 | ALU 69 | FLAGS 70 | B/BL/ADR 71 | MOV NZCV 72 | MRS 73 | } 74 | dispatch1 -> sched1 -> rf.irf -> pipe1 75 | 76 | # Source: Dougall Johnson 77 | sched2: 26-entry Scheduler \#2 78 | 79 | # Source: Dougall Johnson 80 | pipe2: Pipe \#2 { 81 | ALU 82 | FLAGS 83 | B/BL/ADR 84 | MOV NZCV 85 | PTRAUTH 86 | BR/BLR 87 | } 88 | dispatch1 -> sched2 -> rf.irf -> pipe2 89 | 90 | # Source: Dougall Johnson 91 | sched3: 16-entry Scheduler \#3 92 | 93 | # Source: Dougall Johnson 94 | pipe3: Pipe \#3 { 95 | ALU 96 | FLAGS 97 | FROM FP 98 | } 99 | dispatch1 -> sched3 -> rf.irf -> pipe3 100 | 101 | # Source: Dougall Johnson 102 | dispatch2: 12-entry ALU Dispatch Queue \#2 103 | 104 | # Source: Dougall Johnson 105 | sched4: 12-entry Scheduler \#4 106 | 107 | # Source: Dougall Johnson 108 | pipe4: Pipe \#4 { 109 | ALU 110 | FROM FP 111 | } 112 | dispatch2 -> sched4 -> rf.irf -> pipe4 113 | 114 | # Source: Dougall Johnson 115 | sched5: 28-entry Scheduler \#5 116 | 117 | # Source: Dougall Johnson 118 | pipe5: Pipe \#5 { 119 | ALU 120 | MUL 121 | DIV 122 | } 123 | dispatch2 -> sched5 -> rf.irf -> pipe5 124 | 125 | # Source: Dougall Johnson 126 | sched6: 28-entry Scheduler \#6 127 | 128 | # Source: Dougall Johnson 129 
| pipe6: Pipe \#6 { 130 | ALU 131 | MUL 132 | MADD 133 | BFM 134 | CRC 135 | } 136 | dispatch2 -> sched6 -> rf.irf -> pipe6 137 | 138 | # Source: Dougall Johnson 139 | # Source: jiegec, 58(10+48) sched size for load & store 140 | # Source: jiegec, 48 sched w/o nsq size for load & store 141 | dispatch3: 10-entry Memory Dispatch Queue \#3 142 | 143 | # Source: Dougall Johnson 144 | sched7: 48-entry Memory Scheduler \#7 145 | 146 | # Source: Dougall Johnson 147 | pipe7: Pipe \#7 { 148 | STORE 149 | AMX 150 | } 151 | dispatch3 -> sched7 -> rf.irf -> pipe7 152 | 153 | # Source: Dougall Johnson 154 | pipe8: Pipe \#8 { 155 | LOAD 156 | STORE 157 | AMX 158 | } 159 | dispatch3 -> sched7 -> rf.irf -> pipe8 160 | 161 | # Source: Dougall Johnson 162 | pipe9: Pipe \#9 { 163 | LOAD 164 | } 165 | dispatch3 -> sched7 -> rf.irf -> pipe9 166 | 167 | # Source: Dougall Johnson 168 | pipe10: Pipe \#10 { 169 | LOAD 170 | } 171 | dispatch3 -> sched7 -> rf.irf -> pipe10 172 | 173 | lsu: LSU { 174 | # Source: Dougall Johnson 175 | # See https://dougallj.wordpress.com/2021/04/08/apple-m1-load-and-store-queue-measurements/ 176 | # Source: jiegec, see /data/firestorm/lsu.csv: 177 | # load: spike at 130(pattern 4) and 188(pattern 5, 188=130+48(scheduler)+10(dispatch queue)) 178 | 130-entry Load Queue 179 | # store: spike at 107(pattern 2 & 6, 107=60+48(scheduler)-1) and 118(pattern 7, 118=60+48(scheduler)+10(dispatch queue)) 180 | # 60 was computed by 118 - 48(scheduler) - 10(dispatch queue) 181 | 60-entry Store Queue 182 | # Source: jiegec 183 | 2 Load Pipe 184 | 1 Store Pipe 185 | 1 Load/Store Pipe 186 | # Source: jiegec 187 | 4 cycle load to use latency 188 | 3 cycle load to use latency in pointer chasing 189 | } 190 | 191 | pipe7 -> lsu 192 | pipe8 -> lsu 193 | pipe9 -> lsu 194 | pipe10 -> lsu 195 | 196 | rob -> dispatch1 197 | rob -> dispatch2 198 | rob -> dispatch3 199 | 200 | # Source: Dougall Johnson 201 | # Source: jiegec, 156(=12+36+36+36+36) sched size for fp 202 | # Source: jiegec,
143(~=144=36+36+36+36) sched w/o nsq size for fp 203 | dispatch4: 12-entry FP Dispatch Queue \#4 204 | 205 | # Source: Dougall Johnson 206 | sched8: 36-entry Scheduler \#8 207 | 208 | # Source: Dougall Johnson 209 | pipe11: Pipe \#11 { 210 | FP/SIMD 211 | } 212 | dispatch4 -> sched8 -> rf.vrf -> pipe11 213 | 214 | # Source: Dougall Johnson 215 | sched9: 36-entry Scheduler \#9 216 | 217 | # Source: Dougall Johnson 218 | pipe12: Pipe \#12 { 219 | FP/SIMD 220 | } 221 | dispatch4 -> sched9 -> rf.vrf -> pipe12 222 | 223 | # Source: Dougall Johnson 224 | sched10: 36-entry Scheduler \#10 225 | 226 | # Source: Dougall Johnson 227 | pipe13: Pipe \#13 { 228 | FP/SIMD 229 | FCSEL 230 | TO INT 231 | } 232 | dispatch4 -> sched10 -> rf.vrf -> pipe13 233 | 234 | # Source: Dougall Johnson 235 | sched11: 36-entry Scheduler \#11 236 | 237 | # Source: Dougall Johnson 238 | pipe14: Pipe \#14 { 239 | FP/SIMD 240 | FCSEL 241 | TO INT 242 | DIV/RECP 243 | SQRT/SHA 244 | JCVTZS 245 | } 246 | dispatch4 -> sched11 -> rf.vrf -> pipe14 247 | rob -> dispatch4 248 | } 249 | frontend.rename -> backend.rob 250 | frontend.rename -> backend.rename 251 | 252 | mem: Memory { 253 | l1: L1 DC { 254 | # Source: Anandtech 255 | l1dtlb: 256-entry L1 DTLB 256 | 257 | # Source: Dougall Johnson 258 | # Source: jiegec, see hw.perflevel0.l1dcachesize in /data/firestorm/cache.txt 259 | l1dc: 128KB 8-way L1DC 260 | } 261 | 262 | l2: L2 { 263 | # Source: Anandtech 264 | l2tlb: 3072-entry L2 TLB 265 | 266 | # Source: Dougall Johnson 267 | l2dc: 12MB L2 Cache per 4-Core cluster 268 | } 269 | l1 -> l2 270 | } 271 | frontend.l1ic -> mem.l2 272 | backend.lsu -> mem.l1 273 | 274 | info: |md 275 | Drawn by Jiajie Chen @jiegec 276 | 277 | Based on data from Chips and Cheese, Dougall Johnson, JamesAslan and Anandtech 278 | | 279 | } -------------------------------------------------------------------------------- /docs/firestorm.md: -------------------------------------------------------------------------------- 1 | # 
Apple M1 P-core aka Firestorm 2 | 3 | ![](./firestorm.svg) 4 | 5 | References: 6 | 7 | - [Apple Microarchitecture Research by Dougall Johnson](https://dougallj.github.io/applecpu/firestorm.html) 8 | - [不为人知的角落,Apple M2 的小小努力(其一) - JamesAslan](https://zhuanlan.zhihu.com/p/662561990) 9 | - [Apple Announces The Apple Silicon M1: Ditching x86 - What to Expect, Based on A14 - Anandtech](https://www.anandtech.com/show/16226/apple-silicon-m1-a14-deep-dive) 10 | - [Exploration of Apple CPUs](https://github.com/name99-org/AArch64-Explore) 11 | - [Apple M1 Icestorm 微架构评测(上):重铸小核荣光](https://zhuanlan.zhihu.com/p/611213899) 12 | - [Apple M1 Icestorm 微架构(下):重铸小核荣光](https://zhuanlan.zhihu.com/p/613097964) 13 | - [苹果的黑魔法?Apple M1 的栈操作消除(上)](https://zhuanlan.zhihu.com/p/595582920) 14 | - [苹果的黑魔法?(下)Apple M1 的栈操作消除](https://zhuanlan.zhihu.com/p/600349467) 15 | - [Apple Firestorm/Icestorm CPU microarchitecture docs](https://github.com/dougallj/applecpu) 16 | - [The 2020 Mac Mini Unleashed: Putting Apple Silicon M1 To The Test](https://www.anandtech.com/show/16252/mac-mini-apple-m1-tested) 17 | -------------------------------------------------------------------------------- /docs/golden_cove.md: -------------------------------------------------------------------------------- 1 | # Intel Alder Lake P-core aka Golden Cove 2 | 3 | ![](./golden_cove.svg) 4 | 5 | References: 6 | 7 | - [Popping the Hood on Golden Cove](https://chipsandcheese.com/2021/12/02/popping-the-hood-on-golden-cove/) 8 | - [Intel Alder Lake CPU Architectures](https://ieeexplore.ieee.org/document/9747991) 9 | - [Golden Cove](https://en.wikipedia.org/wiki/Golden_Cove) 10 | - [Golden Cove’s Vector Register File: Checking with Official (SPR) Data](https://chipsandcheese.com/2023/01/15/golden-coves-vector-register-file-checking-with-official-spr-data/) 11 | - [4th Gen Intel Xeon Scalable Sapphire Rapids Leaps Forward](https://www.servethehome.com/4th-gen-intel-xeon-scalable-sapphire-rapids-leaps-forward/7/) 12 | - [Intel 
Details Golden Cove: Next-Generation Big Core For Client and Server SoCs](https://fuse.wikichip.org/news/6111/intel-details-golden-cove-next-generation-big-core-for-client-and-server-socs/) 13 | - [Sapphire Rapids: Golden Cove Hits Servers](https://chipsandcheese.com/2023/03/12/a-peek-at-sapphire-rapids/) 14 | - [Golden Cove’s Lopsided Vector Register File](https://chipsandcheese.com/2022/12/25/golden-coves-lopsided-vector-register-file/) 15 | - [Alder Lake Architecture on Hot Chips 33](https://hc33.hotchips.org/assets/program/conference/day1/HC2021.C1.1%20Intel%20Efraim%20Rotem.pdf) 16 | - [Sapphire Rapids on Hot Chips 33](https://hc33.hotchips.org/assets/program/conference/day1/HC2021.C1.4%20Intel%20Arijit.pdf) 17 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1 18 | - [Sapphire Rapids Dieshot](https://www.bilibili.com/video/BV1nb421J7jy/) 19 | -------------------------------------------------------------------------------- /docs/gracemont.d2: -------------------------------------------------------------------------------- 1 | cpu : Intel Alder Lake E-core Gracemont CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | l1btb: 1024-entry L1 BTB 6 | 7 | # Source: Chips and Cheese, Intel 8 | # Intel: "The NLP is backed by the second predictor that includes a 5K 9 | # entry target array combined with path-based information to make 10 | # predictions and verify target addresses in three cycles." 11 | l2btb: 5120-entry L2 BTB, 3 cycle latency 12 | 13 | # Source: Chips and Cheese 14 | ind: >=512-entry Indirect Target Array 15 | 16 | # Source: Intel 17 | bw: 32B/cycle prediction bandwidth 18 | } 19 | 20 | l1ic: L1 IC { 21 | # Source: Chips and Cheese 22 | itlb: 64-entry fully associative ITLB 23 | 24 | # Source: Chips and Cheese, Intel 25 | # Intel: "Larger 64KB Instruction Cache with dual 32B reads (32B read per 26 | # fetch cluster)."
27 | l1ic: 64KB 8-way L1 IC, 2x32 B/cycle 28 | } 29 | 30 | # Source: Chips and Cheese 31 | fq: 2x Fetch Queue 32 | bp -> fq 33 | fq -> l1ic 34 | 35 | iq1: Instruction Queue \#1 36 | l1ic -> iq1 37 | 38 | # Source: Chips and Cheese 39 | decode1: 3-way Decode \#1 40 | iq1 -> decode1 41 | 42 | # Source: Chips and Cheese 43 | uop1: UOP Queue \#1 44 | decode1 -> uop1 45 | 46 | iq2: Instruction Queue \#2 47 | l1ic -> iq2 48 | 49 | # Source: Chips and Cheese 50 | decode2: 3-way Decode \#2 51 | iq2 -> decode2 52 | 53 | # Source: Chips and Cheese 54 | uop2: UOP Queue \#2 55 | decode2 -> uop2 56 | 57 | # Source: Chips and Cheese, Intel 58 | rename: 5-way Rename { 59 | Move Elimination 60 | Zero Idiom 61 | } 62 | uop1 -> rename 63 | uop2 -> rename 64 | } 65 | 66 | backend: Backend { 67 | # Source: Chips and Cheese, Intel 68 | # Intel: "Retirement can be up to eight instructions per cycle for the 69 | # 256-entry retirement buffer." 70 | rob: 256-entry ROB, 8-wide retirement 71 | 72 | # Source: Chips and Cheese 73 | bob: 116-taken-entry 126-not-taken-entry Branch Order Buffer 74 | 75 | rf: Register File { 76 | # Source: Chips and Cheese 77 | irf: 214-entry Integer Register File 78 | 79 | # Source: Chips and Cheese 80 | vrf: 207-entry 128b Vector Register File 81 | } 82 | 83 | # Source: Chips and Cheese 84 | sched1: 15-entry ALU Scheduler \#1 85 | 86 | # Source: Chips and Cheese, Intel 87 | pipe1: Port 0 { 88 | grid-columns: 1 89 | ALU 90 | SHIFT 91 | } 92 | rob -> sched1 -> rf.irf -> pipe1 93 | 94 | # Source: Chips and Cheese 95 | sched2: 16-entry ALU Scheduler \#2 96 | 97 | # Source: Chips and Cheese 98 | pipe2: Port 1 { 99 | grid-columns: 1 100 | ALU 101 | SHIFT 102 | INT MUL 103 | INT DIV 104 | } 105 | rob -> sched2 -> rf.irf -> pipe2 106 | 107 | # Source: Chips and Cheese 108 | sched3: 16-entry ALU Scheduler \#3 109 | 110 | # Source: Chips and Cheese, Intel 111 | pipe3: Port 2 { 112 | grid-columns: 1 113 | ALU 114 | SHIFT 115 | INT MUL 116 | INT DIV 117 | PDEP 118 | } 119 
| rob -> sched3 -> rf.irf -> pipe3 120 | 121 | # Source: Chips and Cheese 122 | sched4: 15-entry ALU Scheduler \#4 123 | 124 | # Source: Chips and Cheese, Intel 125 | pipe4: Port 3 { 126 | grid-columns: 1 127 | ALU 128 | SHIFT 129 | } 130 | rob -> sched4 -> rf.irf -> pipe4 131 | 132 | # Source: Chips and Cheese 133 | # Intel: "The fifth integer reservation station holds jumps and store data 134 | # operations. This structure is banked and can schedule two uops of each 135 | # type every cycle; two store data on ports 08 and 09, and two jumps on 136 | # ports 30 and 31" 137 | sched5: 42-entry Branch/Store Data Scheduler \#5 138 | 139 | # Source: Chips and Cheese, Intel 140 | pipe5: Port 30 { 141 | JMP/Branch 142 | } 143 | rob -> sched5 -> rf.irf -> pipe5 144 | 145 | # Source: Chips and Cheese, Intel 146 | pipe6: Port 31 { 147 | JMP/Branch 148 | } 149 | rob -> sched5 -> rf.irf -> pipe6 150 | 151 | # According to the newer Chips and Cheese post (Meteor Lake’s E-Cores: Crestmont Makes Incremental Progress), 152 | # the scheduler organization differs from the original 2021 analysis: 153 | # "Intel’s optimization guide says Crestmont and Gracemont serve store data 154 | # and jump operations with the same queue. I missed this in 2021 because 155 | # that possibility didn’t cross my mind."
156 | # Source: Chips and Cheese, Intel 157 | pipe7: Port 8 { 158 | Store Data 159 | } 160 | rob -> sched5 -> rf.irf -> pipe7 161 | 162 | # Source: Chips and Cheese, Intel 163 | pipe8: Port 9 { 164 | Store Data 165 | } 166 | rob -> sched5 -> rf.irf -> pipe8 167 | 168 | # Source: Chips and Cheese 169 | nsq1: 22-entry AGU Non-Scheduling Queue \#1 170 | 171 | # Source: Chips and Cheese 172 | sched6: 22-entry AGU Scheduler \#6 173 | 174 | # Source: Chips and Cheese, Intel 175 | pipe9: Port 10 { 176 | Load AGU 177 | } 178 | rob -> nsq1 -> sched6 -> rf.irf -> pipe9 179 | 180 | # Source: Chips and Cheese, Intel 181 | pipe10: Port 11 { 182 | Load AGU 183 | } 184 | rob -> nsq1 -> sched6 -> rf.irf -> pipe10 185 | 186 | # Source: Chips and Cheese, Intel 187 | pipe11: Port 12 { 188 | Store AGU 189 | } 190 | rob -> nsq1 -> sched6 -> rf.irf -> pipe11 191 | 192 | # Source: Chips and Cheese, Intel 193 | pipe12: Port 13 { 194 | Store AGU 195 | } 196 | rob -> nsq1 -> sched6 -> rf.irf -> pipe12 197 | 198 | lsu: LSU { 199 | # Source: Chips and Cheese 200 | 80-entry Load Queue 201 | 50-entry Store Queue 202 | 203 | # Source: jiegec 204 | 4 cycle load to use latency 205 | 3 cycle load to use latency in pointer chasing 206 | } 207 | 208 | pipe7 -> lsu 209 | pipe8 -> lsu 210 | pipe9 -> lsu 211 | pipe10 -> lsu 212 | pipe11 -> lsu 213 | pipe12 -> lsu 214 | 215 | # Source: Chips and Cheese 216 | nsq2: 56-entry FP Non-Scheduling Queue \#2 217 | 218 | # Source: Chips and Cheese 219 | sched7: 18-entry FStore Scheduler \#7 220 | 221 | # Source: Chips and Cheese, Intel 222 | pipe13: Port 28 { 223 | FP Store Data 224 | } 225 | rob -> nsq2 -> sched7 -> rf.vrf -> pipe13 226 | 227 | # Source: Chips and Cheese, Intel 228 | pipe14: Port 29 { 229 | FP Store Data 230 | } 231 | rob -> nsq2 -> sched7 -> rf.vrf -> pipe14 232 | 233 | # Source: Chips and Cheese 234 | sched8: 35-entry FP/Vector Scheduler \#8 235 | 236 | # Source: Chips and Cheese, Intel 237 | pipe15: Port 20 { 238 | grid-columns: 1 239 | INT
Vec ALU 240 | INT Vec MUL 241 | FMA 242 | FADD 243 | FMUL 244 | AES 245 | FDIV 246 | SHA 247 | } 248 | rob -> nsq2 -> sched8 -> rf.vrf -> pipe15 249 | 250 | # Source: Chips and Cheese, Intel 251 | pipe16: Port 21 { 252 | grid-columns: 1 253 | INT Vec ALU 254 | FMA 255 | FADD 256 | FMUL 257 | AES 258 | } 259 | rob -> nsq2 -> sched8 -> rf.vrf -> pipe16 260 | 261 | # Source: Chips and Cheese, Intel 262 | pipe17: Port 22 { 263 | INT Vec ALU 264 | } 265 | rob -> nsq2 -> sched8 -> rf.vrf -> pipe17 266 | } 267 | 268 | frontend.rename -> backend.rob 269 | frontend.rename -> backend.bob 270 | 271 | mem: Memory { 272 | l1: L1 DC { 273 | # Source: Chips and Cheese 274 | # Chips and Cheese: 48-entry 275 | # Intel: 32-entry 276 | l1dtlb: 32-entry fully associative L1 DTLB 277 | 278 | # Source: Chips and Cheese 279 | # Intel: "4-cycle load-to-use latency." 280 | l1dc: 32KB 8-way L1DC, 4 cycle load-to-use latency 281 | } 282 | 283 | l2: L2 { 284 | # Source: Chips and Cheese, Intel 285 | # Intel: "The main STLB is 2048 entries 4-way set associative and caches 286 | # 4KB and 2MB translations. Additionally, Gracemont microarchitecture has 287 | # an 8-entry fully associative structure for GB translations" 288 | l2tlb: 2048-entry 4-way 4K/2M/4M page, 8-entry fully associative 1G page L2 TLB 289 | 290 | # Source: Chips and Cheese, Intel 291 | # Intel: "The L2 cache delivers 64 bytes of data per cycle at a latency of 292 | # 17 cycles, and that bandwidth is shared among four cores." 293 | l2dc: 2MB 16-way Shared L2 Cache, 17 cycle latency 294 | 295 | # Source: Intel 296 | # Intel: "Pipelined Page Miss Handler capable of handling 4 concurrent 297 | # page walks." 
298 | ptw: 4 page table walkers 299 | } 300 | 301 | # Source: Chips and Cheese, Intel 302 | l1 -> l2: 64B/cycle shared among 4 cores 303 | 304 | l3: L3 { 305 | # Source: Chips and Cheese 306 | l3dc: 30MB 12-way L3 Cache 307 | } 308 | l2 -> l3 309 | } 310 | frontend.l1ic -> mem.l2 311 | backend.lsu -> mem.l1 312 | 313 | info: |md 314 | Drawn by Jiajie Chen @jiegec 315 | 316 | Based on data from Chips and Cheese, Intel 317 | | 318 | } -------------------------------------------------------------------------------- /docs/gracemont.md: -------------------------------------------------------------------------------- 1 | # Intel Alder Lake E-core aka Gracemont 2 | 3 | ![](./gracemont.svg) 4 | 5 | References: 6 | 7 | - [Gracemont: Revenge of the Atom Cores](https://chipsandcheese.com/2021/12/21/gracemont-revenge-of-the-atom-cores/) 8 | - [Intel Alder Lake CPU Architectures](https://ieeexplore.ieee.org/document/9747991) 9 | - [Intel’s Gracemont Small Core Eclipses Last-Gen Big Core Performance](https://fuse.wikichip.org/news/6102/intels-gracemont-small-core-eclipses-last-gen-big-core-performance/) 10 | - [Meteor Lake’s E-Cores: Crestmont Makes Incremental Progress](https://chipsandcheese.com/2024/05/13/meteor-lakes-e-cores-crestmont-makes-incremental-progress/) 11 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1 12 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # CPU Microarchitecture Diagrams 2 | 3 | Microarchitecture diagrams of several CPUs. 4 | 5 | Major microarchitectures of CPU vendors: 6 | 7 | 1. AMD: Zen 1 -> Zen 2 -> Zen 3 -> Zen 4 -> Zen 5 8 | 2. 
ARM: 9 | - Cortex-A73 10 | - Cortex-A75 11 | - Cortex-A76/Neoverse-N1 12 | - Cortex-A77 13 | - Cortex-A78/Cortex-X1/Neoverse-V1 14 | - Cortex-A510/Cortex-A710/Cortex-X2/Neoverse-N2 15 | - Cortex-A715/Cortex-X3/Neoverse-V2 16 | - Cortex-A520/Cortex-A720/Cortex-X4/Neoverse-V3 17 | - Cortex-A725/Cortex-X925/Neoverse-N3 18 | 3. Apple: 19 | - M1/A14 (Firestorm + Icestorm) 20 | - M2/A15 (Avalanche + Blizzard) 21 | - A16 (Everest + Sawtooth) 22 | - M3 23 | - A17 24 | - M4 25 | - A18 26 | 4. Intel: 27 | - Skylake 28 | - Sunny Cove (Ice Lake) 29 | - Golden Cove + Gracemont (Alder Lake/Sapphire Rapids) -> Raptor Cove + Gracemont (Raptor Lake/Emerald Rapids) 30 | - Redwood Cove + Crestmont (Meteor Lake/Granite Rapids/Sierra Forest) 31 | - Lion Cove + Skymont (Lunar Lake/Arrow Lake/Clearwater Forest) 32 | 5. Qualcomm: Oryon 33 | -------------------------------------------------------------------------------- /docs/lion_cove.d2: -------------------------------------------------------------------------------- 1 | cpu: Intel Lunar Lake P-core Lion Cove CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor 4 | 5 | # Source: David Huang 6 | l1ic: 64KB L1 IC 7 | 8 | bp -> l1ic 9 | 10 | iq: Instruction Queue 11 | l1ic -> iq 12 | 13 | # Source: Intel 14 | # "DECODE (8-WIDE)" 15 | decode: 8-way Decode 16 | iq -> decode 17 | 18 | uopc: UOP Cache 19 | decode -> uopc 20 | bp -> uopc 21 | 22 | # Source: Intel 23 | # "uOP CACHE (12-WIDE)" 24 | uop: UOP Queue 25 | uopc -> uop: 12 UOP/cycle 26 | decode -> uop: 8 UOP/cycle 27 | 28 | # Source: Intel 29 | # "6 -> 8 wide alloc/rename" 30 | rename: 8-way Rename { 31 | Move Elimination 32 | Zero Idiom 33 | } 34 | uop -> rename 35 | } 36 | 37 | backend: Backend { 38 | # Source: Intel 39 | # "8 -> 12 wide retirement" and "512 -> 576 deep instruction window" in 40 | rob: 576-entry ROB, 12 wide retirement 41 | 42 | rf: Register File { 43 | irf: Integer Register File 44 | 45 | flagsrf: Flags Register File 46 | 47 | vrf: FP/Vector Register File 48 | } 
49 | 50 | sched1: Integer Scheduler 51 | 52 | # Source: Intel 53 | pipe1: Port 0 { 54 | grid-columns: 1 55 | ALU 56 | JMP 57 | } 58 | rob -> sched1 -> rf -> pipe1 59 | 60 | # Source: Intel 61 | pipe2: Port 1 { 62 | grid-columns: 1 63 | ALU 64 | SHIFT 65 | MUL 66 | } 67 | rob -> sched1 -> rf -> pipe2 68 | 69 | # Source: Intel 70 | pipe3: Port 2 { 71 | grid-columns: 1 72 | ALU 73 | JMP 74 | } 75 | rob -> sched1 -> rf -> pipe3 76 | 77 | # Source: Intel 78 | pipe4: Port 3 { 79 | grid-columns: 1 80 | ALU 81 | SHIFT 82 | MUL 83 | } 84 | rob -> sched1 -> rf -> pipe4 85 | 86 | # Source: Intel 87 | pipe5: Port 4 { 88 | grid-columns: 1 89 | ALU 90 | JMP 91 | } 92 | rob -> sched1 -> rf -> pipe5 93 | 94 | # Source: Intel 95 | pipe6: Port 5 { 96 | grid-columns: 1 97 | ALU 98 | SHIFT 99 | MUL 100 | } 101 | rob -> sched1 -> rf -> pipe6 102 | 103 | sched2: Memory Scheduler 104 | 105 | # Source: Intel 106 | pipe7: Port 20 { 107 | Load AGU 108 | } 109 | rob -> sched2 -> rf -> pipe7 110 | 111 | # Source: Intel 112 | pipe8: Port 25 { 113 | Store AGU 114 | } 115 | rob -> sched2 -> rf -> pipe8 116 | 117 | # Source: Intel 118 | pipe9: Port 21 { 119 | Load AGU 120 | } 121 | rob -> sched2 -> rf -> pipe9 122 | 123 | # Source: Intel 124 | pipe10: Port 26 { 125 | Store AGU 126 | } 127 | rob -> sched2 -> rf -> pipe10 128 | 129 | # Source: Intel 130 | pipe11: Port 22 { 131 | Load AGU 132 | } 133 | rob -> sched2 -> rf -> pipe11 134 | 135 | # Source: Intel 136 | pipe12: Port 27 { 137 | Store AGU 138 | } 139 | rob -> sched2 -> rf -> pipe12 140 | 141 | sched3: Store Data Scheduler 142 | 143 | # Source: Intel 144 | pipe13: Port 10 { 145 | Store Data 146 | } 147 | rob -> sched3 -> rf -> pipe13 148 | 149 | # Source: Intel 150 | pipe14: Port 11 { 151 | Store Data 152 | } 153 | rob -> sched3 -> rf -> pipe14 154 | 155 | lsu: LSU { 156 | lq: Load Queue 157 | sq: Store Queue 158 | } 159 | 160 | pipe7 -> lsu 161 | pipe8 -> lsu 162 | pipe9 -> lsu 163 | pipe10 -> lsu 164 | pipe11 -> lsu 165 | pipe12 -> lsu
166 | pipe13 -> lsu 167 | pipe14 -> lsu 168 | 169 | sched4: Vector Scheduler 170 | 171 | # Source: Intel 172 | pipe15: Port V0 { 173 | grid-columns: 1 174 | FMA 175 | ALU 176 | SHIFT 177 | } 178 | rob -> sched4 -> rf -> pipe15 179 | 180 | # Source: Intel 181 | pipe16: Port V1 { 182 | grid-columns: 1 183 | FADD 184 | ALU 185 | SHUF 186 | FPDIV 187 | } 188 | rob -> sched4 -> rf -> pipe16 189 | 190 | # Source: Intel 191 | pipe17: Port V2 { 192 | grid-columns: 1 193 | FMA 194 | ALU 195 | SHIFT 196 | } 197 | rob -> sched4 -> rf -> pipe17 198 | 199 | # Source: Intel 200 | pipe18: Port V3 { 201 | grid-columns: 1 202 | FADD 203 | ALU 204 | SHUF 205 | FPDIV 206 | } 207 | rob -> sched4 -> rf -> pipe18 208 | } 209 | frontend.rename -> backend.rob 210 | 211 | mem: Memory { 212 | l0: L0 DC { 213 | # Source: Intel 214 | # "96 -> 128 pages DTLB" 215 | l1dtlb: 128-entry L1 DTLB 216 | 217 | # Source: Intel 218 | l0dc: 48KB L0 DC 219 | 4 cycle load to use latency 220 | 3x256b or 2x512b read per cycle 221 | } 222 | 223 | l1: L1 DC { 224 | # Source: Intel 225 | 192KB L1 DC 226 | 9 cycle load to use latency 227 | 2x64B read per cycle 228 | } 229 | 230 | l0 -> l1 231 | 232 | l2: L2 Cache { 233 | # Source: Intel 234 | l2dc: 2.5MB/3MB L2 Cache 235 | 17 cycle load to use latency 236 | 2x64B read per cycle 237 | } 238 | 239 | l1 -> l2 240 | } 241 | frontend.l1ic -> mem.l2 242 | backend.lsu -> mem.l0 243 | 244 | info: |md 245 | Drawn by Jiajie Chen @jiegec 246 | 247 | Based on data from Chips and Cheese, Intel, David Huang and Anandtech 248 | | 249 | } 250 | -------------------------------------------------------------------------------- /docs/lion_cove.md: -------------------------------------------------------------------------------- 1 | # Intel Lunar Lake P-core aka Lion Cove 2 | 3 | ![](./lion_cove.svg) 4 | 5 | References: 6 | 7 | - [An Interview with Intel’s Arik Gihon about Lunar Lake at Hot Chips 
2024](https://chipsandcheese.com/2024/09/02/an-interview-with-intels-arik-gihon-about-lunar-lake-at-hot-chips-2024/) 8 | - [Intel’s Lion Cove Architecture Preview](https://chipsandcheese.com/2024/06/03/intels-lion-cove-architecture-preview/) 9 | - [2024 Intel Tech Tour: Next Gen P-core-The Lion Cove Microarchitecture](https://www.intel.com/content/www/us/en/content-details/824430/2024-intel-tech-tour-next-gen-p-core-the-lion-cove-microarchitecture.html) 10 | - [2024 Intel Tech Tour: LNL Architecture Session Highlights](https://www.intel.com/content/www/us/en/content-details/824443/2024-intel-tech-tour-lnl-architecture-session-highlights.html) 11 | - [2024 Intel Technology Tour Keynote](https://www.intel.com/content/www/us/en/content-details/824444/2024-intel-technology-tour-keynote.html) 12 | - [月光下的新探索:Lunar Lake CPU (Lion Cove / Skymont) 微架构测试](https://blog.hjc.im/lunar-lake-cpu-uarch-review.html) 13 | - [Lion Cove: Intel’s P-Core Roars](https://chipsandcheese.com/2024/09/27/lion-cove-intels-p-core-roars/) 14 | - [Intel Announces Core Ultra 200S Arrow Lake CPUs](https://www.phoronix.com/review/intel-core-ultra-200-arrow-lake) 15 | - [Intel® Core Ultra Desktop Processors Launch Briefing](https://download.intel.com/newsroom/2024/client-computing/Intel-Core-Ultra-200S-Series-Presentation.pdf) 16 | -------------------------------------------------------------------------------- /docs/m3_pcore.d2: -------------------------------------------------------------------------------- 1 | cpu : Apple M3 P-core CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | } 5 | 6 | # Coupled Frontend 7 | l1ic: L1 IC { 8 | # Source: Geekerwan 9 | l1ic: 192KB L1 IC 10 | } 11 | 12 | bp -> l1ic 13 | 14 | # Source: Geekerwan 15 | decode: 9-way Decode 16 | l1ic -> decode 17 | decode -> bp 18 | 19 | # Source: Geekerwan 20 | rename: 9-way Rename 21 | decode -> rename 22 | } 23 | 24 | backend: Backend { 25 | # Source: Geekerwan 26 | rob: 321-entry Coalesced ROB 27 | 28 | rf: Register 
File { 29 | # Source: Geekerwan 30 | irf: ~368-entry Integer Register File 31 | 32 | # Source: Geekerwan 33 | vrf: ~423-entry 128b Vector Register File 34 | } 35 | 36 | # Source: Geekerwan 37 | dispatch1: 12-entry Dispatch Queue \#1 38 | 39 | # Source: Geekerwan 40 | sched1: 78-entry Scheduler \#1 41 | 42 | # Source: Geekerwan 43 | pipe1: Pipe \#1 { 44 | ALU 45 | BR 46 | FLAGS 47 | ADR 48 | } 49 | dispatch1 -> sched1 -> rf.irf -> pipe1 50 | 51 | # Source: Geekerwan 52 | pipe2: Pipe \#2 { 53 | ALU 54 | BR 55 | FLAGS 56 | ADR 57 | } 58 | dispatch1 -> sched1 -> rf.irf -> pipe2 59 | 60 | # Source: Geekerwan 61 | pipe3: Pipe \#3 { 62 | ALU 63 | FLAGS 64 | ADR 65 | } 66 | dispatch1 -> sched1 -> rf.irf -> pipe3 67 | 68 | # Source: Geekerwan 69 | pipe4: Pipe \#4 { 70 | ALU 71 | FLAGS 72 | ADR 73 | } 74 | dispatch1 -> sched1 -> rf.irf -> pipe4 75 | 76 | # Source: Geekerwan 77 | dispatch2: 12-entry Dispatch Queue \#2 78 | 79 | # Source: Geekerwan 80 | sched2: 36-entry Scheduler \#2 81 | 82 | # Source: Geekerwan 83 | pipe5: Pipe \#5 { 84 | ALU 85 | ADR 86 | } 87 | dispatch2 -> sched2 -> rf.irf -> pipe5 88 | 89 | # Source: Geekerwan 90 | pipe6: Pipe \#6 { 91 | ALU 92 | } 93 | dispatch2 -> sched2 -> rf.irf -> pipe6 94 | 95 | # Source: Geekerwan 96 | sched3: 26-entry Scheduler \#3 97 | 98 | # Source: Geekerwan 99 | pipe7: Pipe \#7 { 100 | ALU 101 | MUL 102 | DIV 103 | } 104 | dispatch2 -> sched3 -> rf.irf -> pipe7 105 | 106 | # Source: Geekerwan 107 | sched4: 26-entry Scheduler \#4 108 | 109 | # Source: Geekerwan 110 | pipe8: Pipe \#8 { 111 | ALU 112 | MUL 113 | BFM 114 | MADD 115 | } 116 | dispatch2 -> sched4 -> rf.irf -> pipe8 117 | 118 | # Source: Geekerwan 119 | dispatch3: 10-entry Dispatch Queue \#3 120 | 121 | # Source: Geekerwan 122 | sched5: 60-entry Scheduler \#5 123 | 124 | # Source: Geekerwan 125 | pipe9: Pipe \#9 { 126 | STORE 127 | } 128 | dispatch3 -> sched5 -> rf.irf -> pipe9 129 | 130 | # Source: Geekerwan 131 | pipe10: Pipe \#10 { 132 | LOAD 133 | STORE 134 | } 
135 | dispatch3 -> sched5 -> rf.irf -> pipe10 136 | 137 | # Source: Geekerwan 138 | pipe11: Pipe \#11 { 139 | LOAD 140 | } 141 | dispatch3 -> sched5 -> rf.irf -> pipe11 142 | 143 | # Source: Geekerwan 144 | pipe12: Pipe \#12 { 145 | LOAD 146 | } 147 | dispatch3 -> sched5 -> rf.irf -> pipe12 148 | 149 | lsu: LSU { 150 | # Source: Geekerwan 151 | 142-entry Load Queue 152 | 63-entry Store Queue 153 | } 154 | 155 | pipe9 -> lsu 156 | pipe10 -> lsu 157 | pipe11 -> lsu 158 | pipe12 -> lsu 159 | 160 | rob -> dispatch1 161 | rob -> dispatch2 162 | rob -> dispatch3 163 | 164 | # Source: Geekerwan 165 | dispatch4: 12-entry Dispatch Queue \#4 166 | 167 | # Source: Geekerwan 168 | sched6: 41-entry Scheduler \#6 169 | 170 | # Source: Geekerwan 171 | pipe13: Pipe \#13 { 172 | FP 173 | SIMD 174 | } 175 | dispatch4 -> sched6 -> rf.vrf -> pipe13 176 | 177 | # Source: Geekerwan 178 | sched7: 41-entry Scheduler \#7 179 | 180 | # Source: Geekerwan 181 | pipe14: Pipe \#14 { 182 | FP 183 | SIMD 184 | } 185 | dispatch4 -> sched7 -> rf.vrf -> pipe14 186 | 187 | # Source: Geekerwan 188 | sched8: 41-entry Scheduler \#8 189 | 190 | # Source: Geekerwan 191 | pipe15: Pipe \#15 { 192 | FP 193 | SIMD 194 | TO INT 195 | } 196 | dispatch4 -> sched8 -> rf.vrf -> pipe15 197 | 198 | # Source: Geekerwan 199 | sched9: 41-entry Scheduler \#9 200 | 201 | # Source: Geekerwan 202 | pipe16: Pipe \#16 { 203 | FP 204 | SIMD 205 | FSQRT 206 | FCSEL 207 | TO INT 208 | } 209 | dispatch4 -> sched9 -> rf.vrf -> pipe16 210 | rob -> dispatch4 211 | } 212 | frontend.rename -> backend.rob 213 | 214 | mem: Memory { 215 | l1: L1 DC { 216 | # Source: Geekerwan 217 | l1dc: 128KB L1DC 218 | } 219 | } 220 | backend.lsu -> mem.l1 221 | 222 | info: |md 223 | Drawn by Jiajie Chen @jiegec 224 | 225 | Based on data from Geekerwan 226 | | 227 | } -------------------------------------------------------------------------------- /docs/m3_pcore.md: -------------------------------------------------------------------------------- 1 | # 
Apple M3 P-core 2 | 3 | ![](./m3_pcore.svg) 4 | 5 | References: 6 | 7 | - [苹果 M4 性能分析:尽力了,但芯片工艺快到头了!](https://www.bilibili.com/video/BV1NJ4m1w7zk/) 8 | - [MacBook Air M3 简评:性能、续航如何?M1 用户要换吗?](https://www.bilibili.com/video/BV1cw4m1o76r/) 9 | -------------------------------------------------------------------------------- /docs/m4_pcore.d2: -------------------------------------------------------------------------------- 1 | cpu : Apple M4 P-core CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | } 5 | 6 | # Coupled Frontend 7 | l1ic: L1 IC { 8 | # Source: Geekerwan 9 | l1ic: 192KB L1 IC 10 | } 11 | 12 | bp -> l1ic 13 | 14 | # Source: Geekerwan 15 | decode: 10-way Decode 16 | l1ic -> decode 17 | decode -> bp 18 | 19 | # Source: Geekerwan 20 | rename: 10-way Rename 21 | decode -> rename 22 | } 23 | 24 | backend: Backend { 25 | # Source: Geekerwan 26 | rob: 361-entry Coalesced ROB 27 | 28 | rf: Register File { 29 | # Source: Geekerwan 30 | irf: ~446-entry Integer Register File 31 | 32 | # Source: Geekerwan 33 | vrf: ~378-entry 128b Vector Register File 34 | } 35 | 36 | # Source: Geekerwan 37 | dispatch1: 15-entry Dispatch Queue \#1 38 | 39 | # Source: Geekerwan 40 | sched1: 87-entry Scheduler \#1 41 | 42 | # Source: Geekerwan 43 | pipe1: Pipe \#1 { 44 | ALU 45 | BR 46 | FLAGS 47 | ADR 48 | } 49 | dispatch1 -> sched1 -> rf.irf -> pipe1 50 | 51 | # Source: Geekerwan 52 | pipe2: Pipe \#2 { 53 | ALU 54 | BR 55 | FLAGS 56 | ADR 57 | } 58 | dispatch1 -> sched1 -> rf.irf -> pipe2 59 | 60 | # Source: Geekerwan 61 | pipe3: Pipe \#3 { 62 | ALU 63 | FLAGS 64 | ADR 65 | } 66 | dispatch1 -> sched1 -> rf.irf -> pipe3 67 | 68 | # Source: Geekerwan 69 | pipe4: Pipe \#4 { 70 | ALU 71 | FLAGS 72 | ADR 73 | } 74 | dispatch1 -> sched1 -> rf.irf -> pipe4 75 | 76 | # Source: Geekerwan 77 | dispatch2: 15-entry Dispatch Queue \#2 78 | 79 | # Source: Geekerwan 80 | sched2: 19-entry Scheduler \#2 81 | 82 | # Source: Geekerwan 83 | pipe5: Pipe \#5 { 84 | ALU 85 | ADR 86 | } 87 | 
dispatch2 -> sched2 -> rf.irf -> pipe5 88 | 89 | # Source: Geekerwan 90 | sched3: 29-entry Scheduler \#3 91 | 92 | # Source: Geekerwan 93 | pipe6: Pipe \#6 { 94 | ALU 95 | } 96 | dispatch2 -> sched3 -> rf.irf -> pipe6 97 | 98 | # Source: Geekerwan 99 | sched4: 17-entry Scheduler \#4 100 | 101 | # Source: Geekerwan 102 | pipe7: Pipe \#7 { 103 | ALU 104 | MUL 105 | DIV 106 | } 107 | dispatch2 -> sched4 -> rf.irf -> pipe7 108 | 109 | # Source: Geekerwan 110 | sched5: 23-entry Scheduler \#5 111 | 112 | # Source: Geekerwan 113 | pipe8: Pipe \#8 { 114 | ALU 115 | MUL 116 | BFM 117 | MADD 118 | } 119 | dispatch2 -> sched5 -> rf.irf -> pipe8 120 | 121 | # Source: Geekerwan 122 | dispatch3: 15-entry Dispatch Queue \#3 123 | 124 | # Source: Geekerwan 125 | sched6: 72-entry Scheduler \#6 126 | 127 | # Source: Geekerwan 128 | pipe9: Pipe \#9 { 129 | STORE 130 | } 131 | dispatch3 -> sched6 -> rf.irf -> pipe9 132 | 133 | # Source: Geekerwan 134 | pipe10: Pipe \#10 { 135 | LOAD 136 | STORE 137 | } 138 | dispatch3 -> sched6 -> rf.irf -> pipe10 139 | 140 | # Source: Geekerwan 141 | pipe11: Pipe \#11 { 142 | LOAD 143 | } 144 | dispatch3 -> sched6 -> rf.irf -> pipe11 145 | 146 | # Source: Geekerwan 147 | pipe12: Pipe \#12 { 148 | LOAD 149 | } 150 | dispatch3 -> sched6 -> rf.irf -> pipe12 151 | 152 | lsu: LSU { 153 | # Source: Geekerwan 154 | 131-entry Load Queue 155 | 72-entry Store Queue 156 | } 157 | 158 | pipe9 -> lsu 159 | pipe10 -> lsu 160 | pipe11 -> lsu 161 | pipe12 -> lsu 162 | 163 | rob -> dispatch1 164 | rob -> dispatch2 165 | rob -> dispatch3 166 | 167 | # Source: Geekerwan 168 | dispatch4: 23-entry Dispatch Queue \#4 169 | 170 | # Source: Geekerwan 171 | sched7: 61-entry Scheduler \#7 172 | 173 | # Source: Geekerwan 174 | pipe13: Pipe \#13 { 175 | FP 176 | SIMD 177 | } 178 | dispatch4 -> sched7 -> rf.vrf -> pipe13 179 | 180 | # Source: Geekerwan 181 | sched8: 61-entry Scheduler \#8 182 | 183 | # Source: Geekerwan 184 | pipe14: Pipe \#14 { 185 | FP 186 | SIMD 187 | } 188 | 
dispatch4 -> sched8 -> rf.vrf -> pipe14 189 | 190 | # Source: Geekerwan 191 | sched9: 61-entry Scheduler \#9 192 | 193 | # Source: Geekerwan 194 | pipe15: Pipe \#15 { 195 | FP 196 | SIMD 197 | FCSEL 198 | TO INT 199 | } 200 | dispatch4 -> sched9 -> rf.vrf -> pipe15 201 | 202 | # Source: Geekerwan 203 | sched10: 61-entry Scheduler \#10 204 | 205 | # Source: Geekerwan 206 | pipe16: Pipe \#16 { 207 | FP 208 | SIMD 209 | DIV 210 | FSQRT 211 | FCSEL 212 | TO INT 213 | } 214 | dispatch4 -> sched10 -> rf.vrf -> pipe16 215 | rob -> dispatch4 216 | } 217 | frontend.rename -> backend.rob 218 | 219 | mem: Memory { 220 | l1: L1 DC { 221 | # Source: Geekerwan 222 | l1dc: 128KB L1DC 223 | } 224 | } 225 | backend.lsu -> mem.l1 226 | 227 | info: |md 228 | Drawn by Jiajie Chen @jiegec 229 | 230 | Based on data from Geekerwan 231 | | 232 | } -------------------------------------------------------------------------------- /docs/m4_pcore.md: -------------------------------------------------------------------------------- 1 | # Apple M4 P-core 2 | 3 | ![](./m4_pcore.svg) 4 | 5 | References: 6 | 7 | - [苹果 M4 性能分析:尽力了,但芯片工艺快到头了!](https://www.bilibili.com/video/BV1NJ4m1w7zk/) 8 | - [iPhone 16系列性能分析:A18兄弟挺强的!](https://www.bilibili.com/video/BV178tEeVEMD/) 9 | -------------------------------------------------------------------------------- /docs/main.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import math 3 | 4 | all_data = pandas.read_csv("docs/uarch.csv") 5 | 6 | 7 | def define_env(env): 8 | @env.macro 9 | def bp_comparison(): 10 | data = all_data 11 | # filter columns 12 | data = data[ 13 | [ 14 | "uArch", 15 | "L1 BTB", 16 | "L2 BTB", 17 | "L3 BTB", 18 | "ITA", 19 | "RAS", 20 | ] 21 | ] 22 | # drop integer index 23 | data = data.set_index("uArch") 24 | # handle empty fields 25 | data = data.fillna("") 26 | # convert to integer 27 | fields = ["ITA"] 28 | for index, row in data.iterrows(): 29 | for field in fields: 30 | if 
row[field] != "": 31 | data.loc[index, field] = str(int(row[field])) 32 | return data.to_markdown() 33 | 34 | @env.macro 35 | def l1ic_comparison(): 36 | data = all_data 37 | # filter columns 38 | data = data[ 39 | [ 40 | "uArch", 41 | "L1 IC", 42 | "L1 ITLB", 43 | "L2 ITLB", 44 | ] 45 | ] 46 | # drop integer index 47 | data = data.set_index("uArch") 48 | # handle empty fields 49 | data = data.fillna("") 50 | # convert to integer 51 | fields = ["L1 ITLB", "L2 ITLB"] 52 | for index, row in data.iterrows(): 53 | for field in fields: 54 | if row[field] != "": 55 | data.loc[index, field] = str(int(row[field])) 56 | return data.to_markdown() 57 | 58 | @env.macro 59 | def rob_comparison(): 60 | data = all_data 61 | # filter columns 62 | data = data[ 63 | [ 64 | "uArch", 65 | "ROB", 66 | ] 67 | ] 68 | # drop integer index 69 | data = data.set_index("uArch") 70 | return data.to_markdown() 71 | 72 | @env.macro 73 | def eu_comparison(): 74 | data = all_data 75 | # filter columns 76 | data = data[ 77 | [ 78 | "uArch", 79 | "ALU units", 80 | "Branch units", 81 | "FP/Vec units", 82 | ] 83 | ] 84 | # drop integer index 85 | data = data.set_index("uArch") 86 | return data.to_markdown() 87 | 88 | @env.macro 89 | def cortex_x_comparison(): 90 | data = all_data 91 | # only consider cortex x cores 92 | data = data[data["uArch"].str.startswith("ARM Cortex-X")] 93 | # filter columns 94 | data = data[ 95 | [ 96 | "uArch", 97 | "ALU units", 98 | "Branch units", 99 | "Load/Store pipes", 100 | "Load-only pipes", 101 | "Store-only pipes", 102 | "ROB", 103 | "Decode width", 104 | "Rename width", 105 | ] 106 | ] 107 | data["Max Load"] = data["Load/Store pipes"].astype(int) + data["Load-only pipes"].astype(int) 108 | data["Max Store"] = data["Load/Store pipes"].astype(int) + data["Store-only pipes"].astype(int) 109 | data["Max Load+Store"] = ( 110 | data["Load/Store pipes"].astype(int) 111 | + data["Load-only pipes"].astype(int) 112 | + data["Store-only pipes"].astype(int) 113 | ) 114 | 115 | 
# reduce column width 116 | for index, row in data.iterrows(): 117 | data.loc[index, "uArch"] = row["uArch"].removeprefix("ARM ") 118 | # drop integer index 119 | data = data.set_index("uArch") 120 | data = data.transpose() 121 | # compute maximum load 122 | return data.to_markdown() 123 | 124 | @env.macro 125 | def firestorm_oryon_comparison(): 126 | data = all_data 127 | # only consider firestorm/oryon 128 | data = data[data["uArch"].isin(["Apple Firestorm", "Qualcomm Oryon"])] 129 | # filter columns 130 | data = data[ 131 | [ 132 | "uArch", 133 | "L1 BTB", 134 | "L2 BTB", 135 | "RAS", 136 | "L1 IC", 137 | "Decode width", 138 | "Rename width", 139 | "ROB", 140 | "Branch units", 141 | "ALU units", 142 | "FP/Vec units", 143 | "Load/Store pipes", 144 | "Load-only pipes", 145 | "Store-only pipes", 146 | ] 147 | ] 148 | # drop integer index 149 | data = data.set_index("uArch") 150 | fields = ["L1 BTB", "Rename width"] 151 | for index, row in data.iterrows(): 152 | for field in fields: 153 | if row[field] != "": 154 | data.loc[index, field] = str(row[field]) 155 | data = data.transpose() 156 | return data.to_markdown() 157 | -------------------------------------------------------------------------------- /docs/neoverse_n2.md: -------------------------------------------------------------------------------- 1 | # ARM Neoverse N2 2 | 3 | ![](./neoverse_n2.svg) 4 | 5 | References: 6 | 7 | - [Arm Neoverse N2: Arm’s 2nd generation high performance infrastructure CPUs and system IPs](https://hc33.hotchips.org/assets/program/conference/day1/20210818_Hotchips_NeoverseN2.pdf) 8 | -------------------------------------------------------------------------------- /docs/neoverse_v2.d2: -------------------------------------------------------------------------------- 1 | cpu : ARM Neoverse-V2 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: ARM 5 | two predicted branches per cycle 6 | # "Compared to Neoverse V1: 10x larger nano BTB, split main BTB into two 7 | # 
levels with 50% more entries" 8 | two-level branch target buffer 9 | # "Compared to Neoverse V1: 2x larger table with 2-way associativity and 10 | # longer history" 11 | 2-way 8 table TAGE direction predictor with staged output 12 | } 13 | 14 | l1ic: L1 IC { 15 | # Source: ARM 16 | # "64kB, 4-way set-associative L1 instruction cache" 17 | l1ic: 64KB 4-way L1 IC 18 | } 19 | 20 | # Source: ARM 21 | # "Doubled from 16 to 32 entries" 22 | fq: 32-entry Fetch Queue 23 | bp -> fq 24 | fq -> l1ic 25 | 26 | # Source: ARM 27 | # "Increased Decode Queue from 16 to 24 entries" 28 | iq: 24-entry Instruction Queue 29 | l1ic -> iq 30 | 31 | # Source: ARM 32 | # "Increased decoder lanes from 5 to 6" 33 | decode: 6-way Decode 34 | iq -> decode 35 | 36 | # Source: ARM 37 | mopc: MOP Cache 38 | decode -> mopc 39 | bp -> mopc 40 | 41 | mop: MOP Queue 42 | 43 | # Source: ARM 44 | mopc -> mop: 8 MOP/cycle 45 | decode -> mop: 6 MOP/cycle 46 | 47 | # Source: ARM 48 | rename: 8-way Rename { 49 | Zero Idiom 50 | } 51 | mop -> rename 52 | } 53 | 54 | backend: Backend { 55 | rob: ROB 56 | 57 | rf: Register File { 58 | irf: Integer Register File 59 | 60 | flagsrf: Flags Register File 61 | 62 | vrf: FP/Vector Register File 63 | } 64 | 65 | # Source: ARM 66 | # "SX/MX: Increased from 20 to 22 entries" 67 | sched1: 22-entry ALU Scheduler 68 | 69 | # Source: ARM 70 | pipe1: Pipe \#1 { 71 | ALU 72 | } 73 | rob -> sched1 -> rf -> pipe1 74 | 75 | # Source: ARM 76 | pipe2: Pipe \#2 { 77 | Branch 78 | } 79 | rob -> sched1 -> rf -> pipe2 80 | 81 | # Source: ARM 82 | # "SX/MX: Increased from 20 to 22 entries" 83 | sched2: 22-entry ALU Scheduler 84 | 85 | # Source: ARM 86 | pipe3: Pipe \#3 { 87 | ALU 88 | } 89 | rob -> sched2 -> rf -> pipe3 90 | 91 | # Source: ARM 92 | pipe4: Pipe \#4 { 93 | Branch 94 | } 95 | rob -> sched2 -> rf -> pipe4 96 | 97 | # Source: ARM 98 | # "SX/MX: Increased from 20 to 22 entries" 99 | sched3: 22-entry ALU Scheduler 100 | 101 | # Source: ARM 102 | pipe5: Pipe \#5 { 103 | ALU 
104 | SHIFT + ALU 105 | MUL/IMAC/DIV/CRC/SPR 106 | } 107 | rob -> sched3 -> rf -> pipe5 108 | 109 | # Source: ARM 110 | # "SX/MX: Increased from 20 to 22 entries" 111 | sched4: 22-entry ALU Scheduler 112 | 113 | # Source: ARM 114 | pipe6: Pipe \#6 { 115 | ALU 116 | SHIFT + ALU 117 | MUL/IMAC 118 | } 119 | rob -> sched4 -> rf -> pipe6 120 | 121 | # Source: ARM 122 | # "SX/MX: Increased from 20 to 22 entries" 123 | sched5: 22-entry ALU Scheduler 124 | 125 | # Source: ARM 126 | pipe7: Pipe \#7 { 127 | ALU 128 | } 129 | rob -> sched5 -> rf -> pipe7 130 | 131 | # Source: ARM 132 | # "SX/MX: Increased from 20 to 22 entries" 133 | sched6: 22-entry ALU Scheduler 134 | 135 | # Source: ARM 136 | pipe8: Pipe \#8 { 137 | ALU 138 | } 139 | rob -> sched6 -> rf -> pipe8 140 | 141 | # Source: ARM 142 | # "VX: Increased from 20 to 28 entries" 143 | sched7: 28-entry FP/Vector Scheduler 144 | 145 | # Source: ARM 146 | pipe9: Pipe \#9 { 147 | FP/Vector 148 | } 149 | rob -> sched7 -> rf -> pipe9 150 | 151 | # Source: ARM 152 | pipe10: Pipe \#10 { 153 | FP/Vector 154 | } 155 | rob -> sched7 -> rf -> pipe10 156 | 157 | # Source: ARM 158 | # "VX: Increased from 20 to 28 entries" 159 | sched8: 28-entry FP/Vector Scheduler 160 | 161 | # Source: ARM 162 | pipe11: Pipe \#11 { 163 | FP/Vector 164 | } 165 | rob -> sched8 -> rf -> pipe11 166 | 167 | # Source: ARM 168 | pipe12: Pipe \#12 { 169 | FP/Vector 170 | } 171 | rob -> sched8 -> rf -> pipe12 172 | 173 | # Source: ARM 174 | sched9: Memory Scheduler 175 | 176 | # Source: ARM 177 | pipe13: Pipe \#13 { 178 | Load/Store 179 | } 180 | rob -> sched9 -> rf -> pipe13 181 | 182 | # Source: ARM 183 | sched10: Memory Scheduler 184 | 185 | # Source: ARM 186 | pipe14: Pipe \#14 { 187 | Load/Store 188 | } 189 | rob -> sched10 -> rf -> pipe14 190 | 191 | # Source: ARM 192 | sched11: Memory Scheduler 193 | 194 | # Source: ARM 195 | pipe15: Pipe \#15 { 196 | Load 197 | } 198 | rob -> sched11 -> rf -> pipe15 199 | 200 | lsu: LSU { 201 | Load Queue 202 | 
Store Queue 203 | } 204 | 205 | pipe13 -> lsu 206 | pipe14 -> lsu 207 | pipe15 -> lsu 208 | } 209 | frontend.rename -> backend.rob 210 | 211 | mem: Memory { 212 | l1: L1 DC { 213 | # Source: ARM 214 | # "64kB 4-way set associative Dcache" 215 | l1dc: 64KB 4-way L1 DC 216 | } 217 | 218 | l2: L2 { 219 | } 220 | 221 | l1 -> l2 222 | 223 | l3: L3 { 224 | } 225 | l2 -> l3 226 | } 227 | frontend.l1ic -> mem.l2 228 | backend.lsu -> mem.l1 229 | 230 | info: |md 231 | Drawn by Jiajie Chen @jiegec 232 | 233 | Based on data from ARM and Chips and Cheese 234 | | 235 | } 236 | -------------------------------------------------------------------------------- /docs/neoverse_v2.md: -------------------------------------------------------------------------------- 1 | # ARM Neoverse V2 2 | 3 | ![](./neoverse_v2.svg) 4 | 5 | References: 6 | 7 | - [Arm Neoverse V2 platform: Leadership Performance and Power Efficiency for Next-Generation Cloud Computing, ML and HPC Workloads](https://hc2023.hotchips.org/assets/program/conference/day1/CPU1/HC2023.Arm.MagnusBruce.v04.FINAL.pdf) 8 | - [Hot Chips 2023: Arm’s Neoverse V2](https://chipsandcheese.com/p/hot-chips-2023-arms-neoverse-v2) 9 | -------------------------------------------------------------------------------- /docs/oryon.d2: -------------------------------------------------------------------------------- 1 | cpu : Qualcomm Oryon CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | # Source: Qualcomm, 2Ke+ 6 | l1btb: 2048-entry L1 BTB, 1 cycle latency 7 | 8 | # Source: Chips and Cheese 9 | l2btb: 192KB 6-way L1 IC as L2 BTB, 3 cycle latency 10 | 11 | # Source: Chips and Cheese 12 | ind: 2048-entry Indirect Target Buffer 13 | 14 | # Source: Qualcomm 15 | cond: 80KB Conditional Predictor 16 | 17 | # Source: Qualcomm 18 | indir: 40KB Indirect Predictor 19 | 20 | # Source: Chips and Cheese 21 | # Source: jiegec, 50 22 | # Source: Qualcomm, 50e 23 | ras: 50-entry RAS 24 | } 25 | 26 | # Decoupled 
Frontend 27 | fq: Fetch Queue 28 | bp -> fq 29 | 30 | l1ic: L1 IC { 31 | # Source: Chips and Cheese 32 | # Source: Qualcomm 33 | itlb: 256-entry 8-way ITLB, 1 cycle latency 34 | 35 | # Source: Chips and Cheese, Anandtech 36 | # Source: Qualcomm 37 | l1ic: 192KB 6-way L1 IC 38 | } 39 | 40 | fq -> l1ic 41 | 42 | iq: Instruction Queue 43 | # Source: Qualcomm 44 | # "Fetches up to 16 instructions per cycle" 45 | l1ic -> iq: 16 inst/cycle 46 | 47 | # Source: Chips and Cheese, Anandtech 48 | decode: 8-way Decode 49 | iq -> decode 50 | 51 | # Source: Chips and Cheese 52 | uop: UOP Queue 53 | decode -> uop 54 | 55 | # Source: Chips and Cheese 56 | rename: 8-way Rename 57 | uop -> rename 58 | } 59 | 60 | backend: Backend { 61 | # Source: Chips and Cheese 62 | # Source: Qualcomm, 650+ 63 | rob: 680-entry ROB, retire 8 op/cycle 64 | 65 | # Source: jiegec 66 | bob: 119-entry Branch Order Buffer 67 | 68 | rf: Register File { 69 | # Source: Chips and Cheese 70 | # Source: Qualcomm, 400+ 71 | irf: 384 + 32-entry Integer Register File 72 | 73 | # Source: Chips and Cheese 74 | # Source: Qualcomm, 400+ 75 | vrf: 384 + 32-entry 128b Vector Register File 76 | 77 | } 78 | 79 | # Source: Chips and Cheese, Anandtech 80 | # Source: Qualcomm, each with a 20e queue 81 | sched1: 20-entry ALU Scheduler \#1 82 | 83 | # Source: Chips and Cheese 84 | pipe1: Pipe \#1 { 85 | ALU 86 | Direct Branch 87 | Indirect Branch 88 | } 89 | sched1 -> rf.irf -> pipe1 90 | 91 | # Source: Chips and Cheese 92 | sched2: 20-entry ALU Scheduler \#2 93 | 94 | # Source: Chips and Cheese 95 | pipe2: Pipe \#2 { 96 | ALU 97 | Direct Branch 98 | } 99 | sched2 -> rf.irf -> pipe2 100 | 101 | # Source: Chips and Cheese 102 | sched3: 20-entry ALU Scheduler \#3 103 | 104 | # Source: Chips and Cheese 105 | pipe3: Pipe \#3 { 106 | ALU 107 | Crypto 108 | CRC 109 | } 110 | sched3 -> rf.irf -> pipe3 111 | 112 | # Source: Chips and Cheese 113 | sched4: 20-entry ALU Scheduler \#4 114 | 115 | # Source: Chips and Cheese 116 | pipe4: Pipe \#4 { 117 |
ALU 118 | } 119 | sched4 -> rf.irf -> pipe4 120 | 121 | # Source: Chips and Cheese 122 | sched5: 20-entry ALU Scheduler \#5 123 | 124 | # Source: Chips and Cheese 125 | pipe5: Pipe \#5 { 126 | ALU 127 | INT MUL 128 | I2V 129 | } 130 | sched5 -> rf.irf -> pipe5 131 | 132 | # Source: Chips and Cheese 133 | sched6: 20-entry ALU Scheduler \#6 134 | 135 | # Source: Chips and Cheese 136 | pipe6: Pipe \#6 { 137 | ALU 138 | INT MUL 139 | INT DIV 140 | I2V 141 | } 142 | sched6 -> rf.irf -> pipe6 143 | 144 | # Source: Chips and Cheese, Anandtech 145 | # Source: Qualcomm, each with a 16e queue 146 | sched7: 16-entry Memory Scheduler \#7 147 | 148 | # Source: Chips and Cheese 149 | pipe7: Pipe \#7 { 150 | AGU 151 | Load 152 | Store 153 | } 154 | sched7 -> rf.irf -> pipe7 155 | 156 | # Source: Chips and Cheese 157 | sched8: 16-entry Memory Scheduler \#8 158 | 159 | # Source: Chips and Cheese 160 | pipe8: Pipe \#8 { 161 | AGU 162 | Load 163 | Store 164 | } 165 | sched8 -> rf.irf -> pipe8 166 | 167 | # Source: Chips and Cheese 168 | sched9: 16-entry Memory Scheduler \#9 169 | 170 | # Source: Chips and Cheese 171 | pipe9: Pipe \#9 { 172 | AGU 173 | Load 174 | Store 175 | } 176 | sched9 -> rf.irf -> pipe9 177 | 178 | # Source: Chips and Cheese 179 | sched10: 16-entry Memory Scheduler \#10 180 | 181 | # Source: Chips and Cheese 182 | pipe10: Pipe \#10 { 183 | AGU 184 | Load 185 | Store 186 | } 187 | sched10 -> rf.irf -> pipe10 188 | 189 | lsu: LSU { 190 | # Source: Chips and Cheese, Anandtech, Qualcomm 191 | 192-entry Load Queue 192 | 56-entry Store Queue 193 | # Source: Chips and Cheese 194 | 2 Load Pipe 195 | 2 Load/Store Pipe 196 | # Source: jiegec 197 | 4 cycle load to use latency 198 | 3 cycle load to use latency in pointer chasing 199 | } 200 | 201 | pipe7 -> lsu 202 | pipe8 -> lsu 203 | pipe9 -> lsu 204 | pipe10 -> lsu 205 | 206 | rob -> sched1 207 | rob -> sched2 208 | rob -> sched3 209 | rob -> sched4 210 | rob -> sched5 211 | rob -> sched6 212 | rob -> sched7 213 | rob -> 
sched8 214 | rob -> sched9 215 | rob -> sched10 216 | 217 | # Source: Chips and Cheese, Anandtech 218 | # Source: Qualcomm, each with a 48e queue 219 | sched11: 48-entry FP Scheduler \#11 220 | 221 | # Source: Chips and Cheese 222 | pipe11: Pipe \#11 { 223 | 128b ALU 224 | NEON 225 | V2I 226 | } 227 | sched11 -> rf.vrf -> pipe11 228 | 229 | # Source: Chips and Cheese 230 | sched12: 48-entry FP Scheduler \#12 231 | 232 | # Source: Chips and Cheese 233 | pipe12: Pipe \#12 { 234 | 128b ALU 235 | NEON 236 | V2I 237 | } 238 | sched12 -> rf.vrf -> pipe12 239 | 240 | # Source: Chips and Cheese 241 | sched13: 48-entry FP Scheduler \#13 242 | 243 | # Source: Chips and Cheese 244 | pipe13: Pipe \#13 { 245 | 128b ALU 246 | NEON 247 | } 248 | sched13 -> rf.vrf -> pipe13 249 | 250 | # Source: Chips and Cheese 251 | sched14: 48-entry FP Scheduler \#14 252 | 253 | # Source: Chips and Cheese 254 | pipe14: Pipe \#14 { 255 | 128b ALU 256 | NEON 257 | FDIV 258 | FSQRT 259 | } 260 | sched14 -> rf.vrf -> pipe14 261 | rob -> sched11 262 | rob -> sched12 263 | rob -> sched13 264 | rob -> sched14 265 | } 266 | frontend.rename -> backend.rob 267 | frontend.rename -> backend.bob 268 | 269 | mem: Memory { 270 | l1: L1 DC { 271 | # Source: Chips and Cheese, Anandtech, Qualcomm 272 | l1dtlb: 224-entry 7-way L1 DTLB, 1 cycle latency 273 | 274 | # Source: Chips and Cheese, Anandtech 275 | l1dc: 96KB 6-way L1DC, 56-entry MSHR 276 | } 277 | 278 | l2: L2 { 279 | # Source: Chips and Cheese 280 | l2tlb: 8192-entry 8-way L2 TLB 281 | 282 | # Source: Chips and Cheese, Anandtech 283 | l2dc: 12MB 12-way L2 Cache per 4-Core Cluster 284 | } 285 | l1 -> l2: 64B/cycle 286 | 287 | # Source: Chips and Cheese, Anandtech 288 | slc: 6MB System Level Cache 289 | l2 -> slc: 32B/cycle 290 | 291 | # Source: Anandtech 292 | dram: LPDDR5x 8448 MT/s, 8x 16b 293 | slc -> dram 294 | } 295 | frontend.l1ic -> mem.l2 296 | backend.lsu -> mem.l1 297 | 298 | info: |md 299 | Drawn by Jiajie Chen @jiegec 300 | 301 | Based on 
data from Chips and Cheese, Anandtech and Qualcomm 302 | | 303 | } -------------------------------------------------------------------------------- /docs/oryon.md: -------------------------------------------------------------------------------- 1 | # Qualcomm Oryon 2 | 3 | ![](./oryon.svg) 4 | 5 | Oryon-M in Qualcomm 8 Elite: 6 | 7 | 1. 4-wide decode instead of 8 8 | 2. 4 integer pipelines instead of 6 9 | 3. 2 load store pipelines instead of 4 10 | 4. 2 fp/simd pipelines instead of 4 11 | 12 | References: 13 | 14 | - [Qualcomm’s Oryon Core: A Long Time in the Making - Chips and Cheese](https://chipsandcheese.com/2024/07/09/qualcomms-oryon-core-a-long-time-in-the-making/) 15 | - [The Qualcomm Snapdragon X Architecture Deep Dive: Getting To Know Oryon and Adreno X1 - Anandtech](https://www.anandtech.com/show/21445/qualcomm-snapdragon-x-architecture-deep-dive) 16 | - Snapdragon X Series - Architecture Overview 17 | - [高通 X Elite Oryon 微架构评测:走走停停](https://zhuanlan.zhihu.com/p/704707254) 18 | - [Snapdragon X Elite](https://www.qualcomm.com/products/mobile/snapdragon/laptops-and-tablets/snapdragon-x-elite) 19 | - [Qualcomm Oryon CPU](https://www.qualcomm.com/products/technology/processors/oryon) 20 | - [Qualcomm’s Oryon LLVM Patches](https://chipsandcheese.com/2024/05/15/qualcomms-oryon-llvm-patches/) 21 | - [高通自研 PC 芯片 X Elite 实测:真能干翻苹果英特尔?](https://www.bilibili.com/video/BV1Ue41197Qb/) 22 | - [太贵了,它没你想的那么美好!高通骁龙 X Elite 78-100 笔记本详细评测](https://www.bilibili.com/video/BV1z1421r7dZ/) 23 | - [AArch64SchedOryon.td in LLVM](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64SchedOryon.td) 24 | - [Hot Chips 2024: Qualcomm’s Oryon Core](https://chipsandcheese.com/2024/08/26/hot-chips-2024-qualcomms-oryon-core/) 25 | - [骁龙 8 Elite 首发评测:一加13能效有多好?](https://www.bilibili.com/video/BV1xvysYwEcX/) 26 | - [高通 X Elite 深度分析:年度最自信 CPU](https://b23.tv/iL38AXz) 27 | -------------------------------------------------------------------------------- /docs/p550.d2: 
-------------------------------------------------------------------------------- 1 | cpu : SiFive P550 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | ras: 16-entry RAS 6 | 7 | # Source: Chips and Cheese 8 | l1btb: 32-entry L1 BTB 9 | } 10 | 11 | l1ic: L1 IC { 12 | # Source: Chips and Cheese 13 | l1ic: 32KB 4-way L1 IC 14 | 15 | # Source: Chips and Cheese 16 | l1itlb: L1 ITLB 17 | } 18 | 19 | bp -> l1ic 20 | 21 | # Source: Chips and Cheese 22 | decode: 3-way Decode 23 | l1ic -> decode: 12 bytes/cycle 24 | decode -> bp 25 | 26 | # Source: Chips and Cheese 27 | rename: 3-way Rename 28 | decode -> rename 29 | } 30 | 31 | backend: Backend { 32 | # Source: Chips and Cheese 33 | rob: 96-entry ROB 34 | 35 | rf: Register File { 36 | # Source: Chips and Cheese 37 | irf: 128-entry Integer Register File 38 | 39 | # Source: Chips and Cheese 40 | frf: 119-entry FP Register File 41 | } 42 | 43 | # Source: Chips and Cheese 44 | sched1: Scheduler \#1 45 | 46 | # Source: Chips and Cheese 47 | pipe1: Pipe \#1 { 48 | ALU 49 | Branch 50 | } 51 | rob -> sched1 -> rf -> pipe1 52 | 53 | # Source: Chips and Cheese 54 | sched2: Scheduler \#2 55 | 56 | # Source: Chips and Cheese 57 | pipe2: Pipe \#2 { 58 | ALU 59 | } 60 | rob -> sched2 -> rf -> pipe2 61 | 62 | # Source: Chips and Cheese 63 | sched3: Scheduler \#3 64 | 65 | # Source: Chips and Cheese 66 | pipe3: Pipe \#3 { 67 | ALU 68 | MUL 69 | } 70 | rob -> sched3 -> rf -> pipe3 71 | 72 | # Source: Chips and Cheese 73 | sched4: Scheduler \#4 74 | 75 | # Source: Chips and Cheese 76 | pipe4: Pipe \#4 { 77 | Load AGU 78 | } 79 | rob -> sched4 -> rf -> pipe4 80 | 81 | # Source: Chips and Cheese 82 | sched5: Scheduler \#5 83 | 84 | # Source: Chips and Cheese 85 | pipe5: Pipe \#5 { 86 | Store AGU 87 | } 88 | rob -> sched5 -> rf -> pipe5 89 | 90 | # Source: Chips and Cheese 91 | sched6: Scheduler \#6 92 | 93 | # Source: Chips and Cheese 94 | pipe6: Pipe \#6 { 95 | FMA 96 | } 97 | rob -> sched6 -> rf -> 
pipe6 98 | 99 | # Source: Chips and Cheese 100 | pipe7: Pipe \#7 { 101 | FMA 102 | } 103 | rob -> sched6 -> rf -> pipe7 104 | 105 | lsu: LSU { 106 | # Source: Chips and Cheese 107 | 20-entry Load Queue 108 | 15-entry Store Queue 109 | } 110 | 111 | pipe4 -> lsu 112 | pipe5 -> lsu 113 | } 114 | frontend.rename -> backend.rob 115 | 116 | mem: Memory { 117 | l1: L1 DC { 118 | # Source: Chips and Cheese 119 | l1dc: 32KB 4-way L1DC 120 | l1dtlb: 32-entry L1 DTLB 121 | l2tlb: L2 TLB 122 | } 123 | 124 | # Source: Chips and Cheese 125 | l2: 256KB 8-way L2 DC 126 | l1 -> l2 127 | } 128 | backend.lsu -> mem.l1 129 | 130 | info: |md 131 | Drawn by Jiajie Chen @jiegec 132 | 133 | Based on data from Chips and Cheese 134 | | 135 | } -------------------------------------------------------------------------------- /docs/p550.md: -------------------------------------------------------------------------------- 1 | # SiFive P550 2 | 3 | ![](./p550.svg) 4 | 5 | References: 6 | 7 | - [Inside SiFive’s P550 Microarchitecture](https://chipsandcheese.com/p/inside-sifives-p550-microarchitecture) 8 | - [A RISC-V Progress Check: Benchmarking P550 and C910](https://chipsandcheese.com/p/a-risc-v-progress-check-benchmarking) 9 | -------------------------------------------------------------------------------- /docs/p870.d2: -------------------------------------------------------------------------------- 1 | cpu : SiFive P870 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: SiFive 5 | nlp: 1024-entry Next Line Predictor 6 | 7 | # Source: SiFive 8 | ras: 64-entry RAS 9 | 10 | # Source: SiFive 11 | cbp: 16384-entry TAGE Direction Predictor 12 | 13 | # Source: SiFive 14 | ibp: 2560-entry Indirect Predictor 15 | } 16 | 17 | # Coupled Frontend 18 | l1ic: L1 IC { 19 | # Source: SiFive 20 | l1ic: 64KB L1 IC 21 | 22 | # Source: SiFive 23 | l1itlb: 32-entry L1 ITLB 24 | } 25 | 26 | bp -> l1ic 27 | 28 | # Source: SiFive 29 | decode: 6-way Decode 30 | l1ic -> decode: 36 bytes/cycle 31 
| decode -> bp 32 | 33 | # Source: SiFive 34 | rename: 6-way Rename 35 | decode -> rename 36 | } 37 | 38 | backend: Backend { 39 | # Source: SiFive 40 | rob: 1120-entry ROB 41 | 42 | rf: Register File { 43 | # Source: SiFive 44 | irf: 228-entry Integer Register File 45 | 46 | # Source: SiFive 47 | frf: 240-entry FP Register File 48 | 49 | # Source: SiFive 50 | vrf: 128-entry Vector Register File 51 | } 52 | 53 | # Source: SiFive 54 | dispatch1: Integer Dispatch Queue \#1 55 | 56 | # Source: SiFive 57 | sched1: 16-entry Scheduler \#1 58 | 59 | # Source: SiFive 60 | pipe1: Pipe \#1 { 61 | ALU 62 | DIV 63 | } 64 | dispatch1 -> sched1 -> rf -> pipe1 65 | 66 | # Source: SiFive 67 | sched2: 16-entry Scheduler \#2 68 | 69 | # Source: SiFive 70 | pipe2: Pipe \#2 { 71 | ALU 72 | MUL 73 | } 74 | dispatch1 -> sched2 -> rf -> pipe2 75 | 76 | # Source: SiFive 77 | sched3: 16-entry Scheduler \#3 78 | 79 | # Source: SiFive 80 | pipe3: Pipe \#3 { 81 | ALU 82 | MUL 83 | } 84 | dispatch1 -> sched3 -> rf -> pipe3 85 | 86 | # Source: SiFive 87 | sched4: 16-entry Scheduler \#4 88 | 89 | # Source: SiFive 90 | pipe4: Pipe \#4 { 91 | ALU 92 | } 93 | dispatch1 -> sched4 -> rf -> pipe4 94 | 95 | # Source: SiFive 96 | sched5: 16-entry Scheduler \#5 97 | 98 | # Source: SiFive 99 | pipe5: Pipe \#5 { 100 | BR 101 | ALU 102 | } 103 | dispatch1 -> sched5 -> rf -> pipe5 104 | 105 | # Source: SiFive 106 | sched6: 16-entry Scheduler \#6 107 | 108 | # Source: SiFive 109 | pipe6: Pipe \#6 { 110 | BR 111 | } 112 | dispatch1 -> sched6 -> rf -> pipe6 113 | 114 | # Source: SiFive 115 | dispatch2: Memory Dispatch Queue \#2 116 | 117 | # Source: SiFive 118 | sched7: 32-entry Scheduler \#7 119 | 120 | # Source: SiFive 121 | pipe7: Pipe \#7 { 122 | AGU 123 | LD 124 | } 125 | dispatch2 -> sched7 -> rf -> pipe7 126 | 127 | # Source: SiFive 128 | pipe8: Pipe \#8 { 129 | AGU 130 | LDST 131 | } 132 | dispatch2 -> sched7 -> rf -> pipe8 133 | 134 | # Source: SiFive 135 | pipe9: Pipe \#9 { 136 | AGU 137 | LDST 138 | 
} 139 | dispatch2 -> sched7 -> rf -> pipe9 140 | 141 | lsu: LSU { 142 | # Source: SiFive 143 | 48-entry Load Queue 144 | 48-entry Store Queue 145 | 1 Load Pipe 146 | 2 Load/Store pipe 147 | } 148 | 149 | pipe7 -> lsu 150 | pipe8 -> lsu 151 | pipe9 -> lsu 152 | 153 | # Source: SiFive 154 | dispatch3: FP Dispatch Queue \#3 155 | 156 | # Source: SiFive 157 | sched8: 24-entry Scheduler \#8 158 | 159 | # Source: SiFive 160 | pipe10: Pipe \#10 { 161 | FADD 162 | FMUL 163 | FMAC 164 | } 165 | dispatch3 -> sched8 -> rf -> pipe10 166 | 167 | # Source: SiFive 168 | sched9: 24-entry Scheduler \#9 169 | 170 | # Source: SiFive 171 | pipe11: Pipe \#11 { 172 | FADD 173 | FMUL 174 | FMAC 175 | FDIV 176 | FSQRT 177 | } 178 | dispatch3 -> sched9 -> rf -> pipe11 179 | 180 | # Source: SiFive 181 | seq: Vector Sequencer 182 | 183 | # Source: SiFive 184 | dispatch4: Vector Dispatch Queue \#4 185 | 186 | # Source: SiFive 187 | sched10: 16-entry Scheduler \#10 188 | 189 | # Source: SiFive 190 | pipe12: Pipe \#12 { 191 | Vec ADD 192 | Vec MUL 193 | Vec MAC 194 | Vec Crypto 195 | Vec Div 196 | Vec Permute 197 | } 198 | dispatch4 -> sched10 -> rf -> pipe12 199 | 200 | # Source: SiFive 201 | sched11: 16-entry Scheduler \#11 202 | 203 | # Source: SiFive 204 | pipe13: Pipe \#13 { 205 | Vec ADD 206 | Vec MUL 207 | Vec MAC 208 | Vec Crypto 209 | Vec Mask 210 | } 211 | dispatch4 -> sched11 -> rf -> pipe13 212 | 213 | rob -> dispatch1 214 | rob -> dispatch2 215 | rob -> dispatch3 216 | rob -> seq -> dispatch4 217 | } 218 | frontend.rename -> backend.rob 219 | 220 | mem: Memory { 221 | l1: L1 DC { 222 | # Source: SiFive 223 | l1dc: 64KB L1DC 224 | l1dtlb: 64-entry L1 DTLB 225 | l2tlb: 1024-entry L2 TLB 226 | } 227 | } 228 | backend.lsu -> mem.l1 229 | 230 | info: |md 231 | Drawn by Jiajie Chen @jiegec 232 | 233 | Based on data from SiFive 234 | | 235 | } -------------------------------------------------------------------------------- /docs/p870.md:
-------------------------------------------------------------------------------- 1 | # SiFive P870 2 | 3 | ![](./p870.svg) 4 | 5 | References: 6 | 7 | - [P870 High-Performance RISC-V Processor](https://hc2023.hotchips.org/assets/program/conference/day1/CPU2/P870%20for%20Hot%20Chips%20-%20FInal.pdf) 8 | -------------------------------------------------------------------------------- /docs/redwood_cove.d2: -------------------------------------------------------------------------------- 1 | cpu: Intel Meteor Lake P-core Redwood Cove CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | l1btb: 128-entry L1 BTB, 0 bubble 6 | 7 | # Source: Chips and Cheese 8 | # The figure says another 6K-entry BTB level, but it is not shown in 9 | # testing result 10 | l2btb: 12K-entry L2 BTB, 1 bubble 11 | } 12 | 13 | l1ic: L1 IC { 14 | # Source: Intel 15 | # "Larger instruction cache: 32K→64K." 16 | l1ic: 64KB L1 IC 17 | 18 | # Source: Chips and Cheese 19 | l1itlb: 256-entry 8-way L1 ITLB 20 | } 21 | 22 | bp -> l1ic 23 | 24 | iq: Instruction Queue 25 | l1ic -> iq 26 | 27 | # Source: Intel 28 | decode: 6-way Decode 29 | iq -> decode 30 | 31 | # Source: Chips and Cheese 32 | uopc: 4096-entry UOP Cache 33 | decode -> uopc 34 | bp -> uopc 35 | 36 | # Source: Intel 37 | # "Improved LSD coverage: the IDQ can hold 192 μops per logical processor in 38 | # single-thread mode or 96 μops per thread when SMT is active." 
39 | uop: 2x96 UOP Queue 40 | uopc -> uop: 8 UOP/cycle 41 | decode -> uop: 6 UOP/cycle 42 | 43 | # Source: Intel 44 | rename: 6-way Rename { 45 | Move Elimination 46 | Zero Idiom 47 | } 48 | uop -> rename 49 | } 50 | 51 | backend: Backend { 52 | # Source: Intel 53 | # "8 -> 12 wide retirement" and "512 -> 576 deep instruction window" in 54 | # Redwood Cove vs Lion Cove comparison 55 | rob: 512-entry ROB, 8 wide retirement 56 | 57 | # Source: Chips and Cheese 58 | bob: 128-entry Branch Order Buffer 59 | 60 | rf: Register File { 61 | # Source: Chips and Cheese 62 | irf: 280-entry Integer Register File 63 | 64 | flagsrf: Flags Register File 65 | 66 | # Source: Chips and Cheese 67 | vrf: 332-entry FP/Vector Register File 68 | } 69 | 70 | # Source: Chips and Cheese 71 | sched1: 97-entry Unified Math Scheduler 72 | 73 | # Source: Intel 74 | pipe1: Port 0 { 75 | grid-columns: 1 76 | ALU 77 | LEA 78 | Shift 79 | JMP 80 | Vec FMA 81 | Vec ALU 82 | Vec Shift 83 | Vec fpDIV 84 | } 85 | rob -> sched1 -> rf -> pipe1 86 | 87 | # Source: Intel 88 | pipe2: Port 1 { 89 | grid-columns: 1 90 | ALU 91 | LEA 92 | Shift 93 | Int DIV 94 | Vec FMA 95 | Vec ALU 96 | Vec Shift 97 | Vec Shuffle 98 | Vec FADD 99 | } 100 | rob -> sched1 -> rf -> pipe2 101 | 102 | # Source: Intel 103 | pipe3: Port 5 { 104 | grid-columns: 1 105 | ALU 106 | LEA 107 | MulHi 108 | Vec FMA512 109 | Vec ALU 110 | Vec AMX 111 | Vec Shuffle 112 | Vec FADD 113 | } 114 | rob -> sched1 -> rf -> pipe3 115 | 116 | # Source: Intel 117 | pipe4: Port 6 { 118 | grid-columns: 1 119 | ALU 120 | LEA 121 | Shift 122 | JMP 123 | } 124 | rob -> sched1 -> rf -> pipe4 125 | 126 | # Source: Intel 127 | pipe5: Port 11 { 128 | grid-columns: 1 129 | ALU 130 | LEA 131 | } 132 | rob -> sched1 -> rf -> pipe5 133 | 134 | # Source: Chips and Cheese 135 | sched2: 70-entry Load Scheduler 136 | 137 | # Source: Intel 138 | pipe6: Port 2 { 139 | Load AGU 140 | } 141 | rob -> sched2 -> rf -> pipe6 142 | 143 | # Source: Intel 144 | pipe7: Port 3 { 145 
| Load AGU 146 | } 147 | rob -> sched2 -> rf -> pipe7 148 | 149 | # Source: Intel 150 | pipe8: Port 10 { 151 | Load AGU 152 | } 153 | rob -> sched2 -> rf -> pipe8 154 | 155 | # Source: Chips and Cheese 156 | sched3: 38-entry Store Scheduler 157 | 158 | # Source: Intel 159 | pipe9: Port 7 { 160 | Store AGU 161 | } 162 | rob -> sched3 -> rf -> pipe9 163 | 164 | # Source: Intel 165 | pipe10: Port 8 { 166 | Store AGU 167 | } 168 | rob -> sched3 -> rf -> pipe10 169 | 170 | # Source: Intel 171 | pipe11: Port 4 { 172 | Store Data 173 | } 174 | rob -> sched3 -> rf -> pipe11 175 | 176 | # Source: Intel 177 | pipe12: Port 9 { 178 | Store Data 179 | } 180 | rob -> sched3 -> rf -> pipe12 181 | 182 | lsu: LSU { 183 | # Source: Chips and Cheese 184 | lq: 192-entry Load Queue 185 | sq: 114-entry Store Queue 186 | 187 | # Source: Intel 188 | # Redwood Cove vs Lion Cove comparison 189 | 5 cycle load to use latency 190 | 3x256b or 2x512b read per cycle 191 | } 192 | 193 | pipe6 -> lsu 194 | pipe7 -> lsu 195 | pipe8 -> lsu 196 | pipe9 -> lsu 197 | pipe10 -> lsu 198 | pipe11 -> lsu 199 | pipe12 -> lsu 200 | } 201 | frontend.rename -> backend.rob 202 | frontend.rename -> backend.bob 203 | 204 | mem: Memory { 205 | l1: L1 DC { 206 | # Source: Intel 207 | # "96 -> 128 pages DTLB" in 208 | # Redwood Cove vs Lion Cove comparison 209 | l1dtlb: 96-entry L1 DTLB 210 | 211 | # Source: Intel 212 | # Redwood Cove vs Lion Cove comparison 213 | l1dc: 48KB L1DC 214 | } 215 | 216 | l2: L2 { 217 | # Source: Chips and Cheese 218 | l2tlb: 2048-entry L2 TLB 219 | 220 | # Source: Intel 221 | # "Mid-level-cache size increased to 2MBs for Client." 222 | l2dc: 2MB L2 (Mid Level) Cache 223 | 224 | # Source: Intel 225 | # "Increased number of outstanding misses (48→64 Deeper MLC miss queues)." 
226 | mshr: 64-entry MSHR 227 | 228 | # Source: Intel 229 | 16 cycle load to use latency 230 | # Source: Intel 231 | 2x64B read per cycle 232 | } 233 | 234 | l1 -> l2 235 | } 236 | frontend.l1ic -> mem.l2 237 | backend.lsu -> mem.l1 238 | 239 | info: |md 240 | Drawn by Jiajie Chen @jiegec 241 | 242 | Based on data from Chips and Cheese, Intel and Anandtech 243 | | 244 | } 245 | -------------------------------------------------------------------------------- /docs/redwood_cove.md: -------------------------------------------------------------------------------- 1 | # Intel Meteor Lake P-core aka Redwood Cove 2 | 3 | ![](./redwood_cove.svg) 4 | 5 | References: 6 | 7 | - [Previewing Meteor Lake at CES](https://chipsandcheese.com/2024/01/11/previewing-meteor-lake-at-ces/) 8 | - [Intel Unveils Meteor Lake Architecture: Intel 4 Heralds the Disaggregated Future of Mobile CPUs](https://www.anandtech.com/show/20046/intel-unveils-meteor-lake-architecture-intel-4-heralds-the-disaggregated-future-of-mobile-cpus/2) 9 | - [2023 Intel Tech Tour: Meteor Lake Architecture Overview](https://www.intel.com/content/www/us/en/content-details/788851/2023-intel-tech-tour-meteor-lake-architecture-overview.html) 10 | - [Intel’s Redwood Cove: Baby Steps are Still Steps](https://chipsandcheese.com/2024/09/22/intels-redwood-cove-baby-steps-are-still-steps/) 11 | -------------------------------------------------------------------------------- /docs/skylake.d2: -------------------------------------------------------------------------------- 1 | cpu : Intel Skylake CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | } 5 | 6 | l1ic: L1 IC { 7 | # Source: Wikichip 8 | l1ic: 32KB 8-way L1 IC 9 | } 10 | 11 | bp -> l1ic 12 | 13 | # Source: Wikichip 14 | iq: 2x 25-entry Instruction Queue 15 | l1ic -> iq 16 | 17 | # Source: Wikichip 18 | decode: 4-way Decode 19 | iq -> decode 20 | 21 | # Source: Wikichip 22 | uopc: 1536-entry 8-way UOP Cache 23 | decode -> uopc 24 | bp -> uopc 25 | 26 | # 
Source: Wikichip 27 | uop: 2x 64-entry UOP Queue 28 | uopc -> uop 29 | decode -> uop 30 | 31 | # Source: Wikichip 32 | rename: Rename { 33 | Move Elimination 34 | Zero Idiom 35 | } 36 | uop -> rename 37 | } 38 | 39 | backend: Backend { 40 | # Source: Wikichip 41 | rob: ROB 42 | 43 | # Source: Wikichip 44 | bob: 48-entry Branch Order Buffer 45 | 46 | rf: "" { 47 | # Source: Wikichip 48 | # Source: jiegec, 141 speculative 49 | irf: 180-entry Integer Register File 50 | 51 | # Source: jiegec, 141 speculative 52 | flagsrf: 141-entry Flags Register File 53 | 54 | # Source: Wikichip 55 | # Source: jiegec, 106 speculative 56 | vrf: 168-entry FP/Vector Register File 57 | } 58 | 59 | # Source: Wikichip 60 | sched1: 97-entry Unified Scheduler 61 | 62 | # Source: Wikichip 63 | pipe1: Port 0 { 64 | grid-columns: 1 65 | ALU 66 | INT DIV 67 | INT Vec ALU 68 | INT Vec MUL 69 | FMA 70 | AES 71 | Vec String 72 | FDIV 73 | Branch 74 | } 75 | rob -> sched1 -> rf -> pipe1 76 | 77 | # Source: Wikichip 78 | pipe2: Port 1 { 79 | grid-columns: 1 80 | ALU 81 | MUL 82 | INT Vec ALU 83 | INT Vec MUL 84 | FMA 85 | Bit Scan 86 | } 87 | rob -> sched1 -> rf -> pipe2 88 | 89 | # Source: Wikichip 90 | pipe3: Port 5 { 91 | grid-columns: 1 92 | ALU 93 | Vec Shuffle 94 | INT Vec ALU 95 | LEA 96 | } 97 | rob -> sched1 -> rf -> pipe3 98 | 99 | # Source: Wikichip 100 | pipe4: Port 6 { 101 | grid-columns: 1 102 | ALU 103 | Branch 104 | } 105 | rob -> sched1 -> rf -> pipe4 106 | 107 | # Source: Wikichip 108 | pipe5: Port 2 { 109 | AGU 110 | Load Data 111 | } 112 | rob -> sched1 -> rf -> pipe5 113 | 114 | # Source: Wikichip 115 | pipe6: Port 3 { 116 | AGU 117 | Load Data 118 | } 119 | rob -> sched1 -> rf -> pipe6 120 | 121 | # Source: Wikichip 122 | pipe7: Port 4 { 123 | Store Data 124 | } 125 | rob -> sched1 -> rf -> pipe7 126 | 127 | # Source: Wikichip 128 | pipe8: Port 7 { 129 | AGU 130 | } 131 | rob -> sched1 -> rf -> pipe8 132 | 133 | 134 | lsu: LSU { 135 | # Source: Wikichip 136 | 72-entry Load Queue 
137 | 56-entry Store Queue 138 | } 139 | 140 | pipe5 -> lsu 141 | pipe6 -> lsu 142 | pipe7 -> lsu 143 | pipe8 -> lsu 144 | } 145 | frontend.rename -> backend.rob 146 | frontend.rename -> backend.bob 147 | 148 | mem: Memory { 149 | l1: L1 DC { 150 | # Source: Wikichip 151 | l1dc: 32KB 8-way L1DC 152 | 153 | # Source: Wikichip 154 | mshr: 10-entry MSHR 155 | } 156 | 157 | l2: L2 { 158 | # Source: Wikichip, Intel 159 | # "Skylake server microarchitecture implements a mid-level (L2) cache of 1 160 | # MB capacity with a minimum load-to-use latency of 14 cycles. The 161 | # mid-level cache capacity is four times larger than the capacity in 162 | # previous Intel Xeon processor family implementations. The line size of 163 | # the mid-level cache is 64B and it is 16-way associative. The mid-level 164 | # cache is private to each core." 165 | l2dc: 256 KB 4-way/1MB 16-way L2 DC 166 | 14 cycle load to use latency 167 | } 168 | 169 | # Source: Wikichip 170 | l1 -> l2: 64B/cycle 171 | 172 | l3: L3 { 173 | # Source: Wikichip 174 | l3dc: L3 Cache 175 | } 176 | l2 -> l3 177 | } 178 | frontend.l1ic -> mem.l2 179 | backend.lsu -> mem.l1 180 | 181 | info: |md 182 | Drawn by Jiajie Chen @jiegec 183 | 184 | Based on data from Wikichip 185 | | 186 | } -------------------------------------------------------------------------------- /docs/skylake.md: -------------------------------------------------------------------------------- 1 | # Intel Skylake 2 | 3 | ![](./skylake.svg) 4 | 5 | References: 6 | 7 | - [Skylake (client) - Microarchitectures - Intel](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)) 8 | - [Skylake (server) - Microarchitectures - Intel](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)) 9 | -------------------------------------------------------------------------------- /docs/skymont.d2: -------------------------------------------------------------------------------- 1 | cpu : Intel Lunar Lake E-core Skymont CPU { 2 |
frontend: Frontend { 3 | bp: Branch Predictor { 4 | } 5 | 6 | l1ic: L1 IC { 7 | # Source: Intel 8 | l1ic: 64K L1 IC 9 | } 10 | 11 | # Source: Intel 12 | fq: 3x Fetch Queue 13 | bp -> fq 14 | fq -> l1ic 15 | 16 | iq1: Instruction Queue \#1 17 | l1ic -> iq1 18 | 19 | # Source: Intel 20 | decode1: 3-way Decode \#1 21 | iq1 -> decode1 22 | 23 | # Source: Intel 24 | # "Uop queue capacity: 64 -> 96 entries" 25 | uop1: 32-entry UOP Queue \#1 26 | decode1 -> uop1 27 | 28 | iq2: Instruction Queue \#2 29 | l1ic -> iq2 30 | 31 | # Source: Intel 32 | decode2: 3-way Decode \#2 33 | iq2 -> decode2 34 | 35 | # Source: Intel 36 | uop2: 32-entry UOP Queue \#2 37 | decode2 -> uop2 38 | 39 | # Source: Intel 40 | iq3: Instruction Queue \#3 41 | l1ic -> iq3 42 | 43 | # Source: Intel 44 | decode3: 3-way Decode \#3 45 | iq3 -> decode3 46 | 47 | # Source: Intel 48 | uop3: 32-entry UOP Queue \#3 49 | decode3 -> uop3 50 | 51 | # Source: Intel 52 | # "allocation (from 6)" 53 | rename: 8-way Rename { 54 | Move Elimination 55 | Zero Idiom 56 | } 57 | uop1 -> rename 58 | uop2 -> rename 59 | uop3 -> rename 60 | } 61 | 62 | backend: Backend { 63 | # Source: Intel 64 | # "Larger out of order window (ROB 256 -> 416 entry)" 65 | # "16-wide retire (from 8)" 66 | rob: 416-entry ROB, 16-wide retirement 67 | 68 | rf: Register File { 69 | irf: Integer Register File 70 | 71 | vrf: Vector Register File 72 | } 73 | 74 | sched1: Scheduler \#1 75 | 76 | # Source: Intel 77 | pipe1: Port 0 { 78 | grid-columns: 1 79 | ALU 80 | SHIFT 81 | MUL 82 | DIV 83 | } 84 | rob -> sched1 -> rf -> pipe1 85 | 86 | sched2: Scheduler \#2 87 | 88 | # Source: Intel 89 | pipe2: Port 5 { 90 | grid-columns: 1 91 | ALU 92 | } 93 | rob -> sched2 -> rf -> pipe2 94 | 95 | sched3: Scheduler \#3 96 | 97 | # Source: Intel 98 | pipe3: Port 2 { 99 | grid-columns: 1 100 | ALU 101 | SHIFT 102 | MUL 103 | DIV 104 | } 105 | rob -> sched3 -> rf -> pipe3 106 | 107 | sched4: Scheduler \#4 108 | 109 | # Source: Intel 110 | pipe4: Port 6 { 111 | 
grid-columns: 1 112 | ALU 113 | } 114 | rob -> sched4 -> rf -> pipe4 115 | 116 | sched5: Scheduler \#5 117 | 118 | # Source: Intel 119 | pipe5: Port 0 { 120 | grid-columns: 1 121 | ALU 122 | SHIFT 123 | } 124 | rob -> sched5 -> rf -> pipe5 125 | 126 | sched6: Scheduler \#6 127 | 128 | # Source: Intel 129 | pipe6: Port 0 { 130 | grid-columns: 1 131 | ALU 132 | } 133 | rob -> sched6 -> rf -> pipe6 134 | 135 | sched7: Scheduler \#7 136 | 137 | # Source: Intel 138 | pipe7: Port 3 { 139 | grid-columns: 1 140 | ALU 141 | SHIFT 142 | } 143 | rob -> sched7 -> rf -> pipe7 144 | 145 | sched8: Scheduler \#8 146 | 147 | # Source: Intel 148 | pipe8: Port 7 { 149 | grid-columns: 1 150 | ALU 151 | } 152 | rob -> sched8 -> rf -> pipe8 153 | 154 | sched9: Scheduler \#9 155 | 156 | # Source: Intel 157 | pipe9: Port 30 { 158 | grid-columns: 1 159 | JMP 160 | } 161 | rob -> sched9 -> rf -> pipe9 162 | 163 | sched10: Scheduler \#10 164 | 165 | # Source: Intel 166 | pipe10: Port 31 { 167 | grid-columns: 1 168 | JMP 169 | } 170 | rob -> sched10 -> rf -> pipe10 171 | 172 | sched11: Scheduler \#11 173 | 174 | # Source: Intel 175 | pipe11: Port 32 { 176 | grid-columns: 1 177 | JMP 178 | } 179 | rob -> sched11 -> rf -> pipe11 180 | 181 | sched12: Scheduler \#12 182 | 183 | # Source: Intel 184 | pipe12: Port 8 { 185 | grid-columns: 1 186 | STD 187 | } 188 | rob -> sched12 -> rf -> pipe12 189 | 190 | sched13: Scheduler \#13 191 | 192 | # Source: Intel 193 | pipe13: Port 9 { 194 | grid-columns: 1 195 | STD 196 | } 197 | rob -> sched13 -> rf -> pipe13 198 | 199 | sched14: Scheduler \#14 200 | 201 | # Source: Intel 202 | pipe14: Port 10 { 203 | grid-columns: 1 204 | LD AGU 205 | } 206 | rob -> sched14 -> rf -> pipe14 207 | 208 | sched15: Scheduler \#15 209 | 210 | # Source: Intel 211 | pipe15: Port 11 { 212 | grid-columns: 1 213 | LD AGU 214 | } 215 | rob -> sched15 -> rf -> pipe15 216 | 217 | sched16: Scheduler \#16 218 | 219 | # Source: Intel 220 | pipe16: Port 12 { 221 | grid-columns: 1 222 | 
LD AGU 223 | } 224 | rob -> sched16 -> rf -> pipe16 225 | 226 | sched17: Scheduler \#17 227 | 228 | # Source: Intel 229 | pipe17: Port 13 { 230 | grid-columns: 1 231 | ST AGU 232 | } 233 | rob -> sched17 -> rf -> pipe17 234 | 235 | sched18: Scheduler \#18 236 | 237 | # Source: Intel 238 | pipe18: Port 14 { 239 | grid-columns: 1 240 | ST AGU 241 | } 242 | rob -> sched18 -> rf -> pipe18 243 | 244 | sched19: Scheduler \#19 245 | 246 | # Source: Intel 247 | pipe19: Port 15 { 248 | grid-columns: 1 249 | ST AGU 250 | } 251 | rob -> sched19 -> rf -> pipe19 252 | 253 | sched20: Scheduler \#20 254 | 255 | # Source: Intel 256 | pipe20: Port 16 { 257 | grid-columns: 1 258 | ST AGU 259 | } 260 | rob -> sched20 -> rf -> pipe20 261 | 262 | sched21: Scheduler \#21 263 | 264 | # Source: Intel 265 | pipe21: Port 28 { 266 | grid-columns: 1 267 | VEC STD 268 | } 269 | rob -> sched21 -> rf -> pipe21 270 | 271 | sched22: Scheduler \#22 272 | 273 | # Source: Intel 274 | pipe22: Port 29 { 275 | grid-columns: 1 276 | VEC STD 277 | } 278 | rob -> sched22 -> rf -> pipe22 279 | 280 | lsu: LSU { 281 | Load Queue 282 | Store Queue 283 | } 284 | 285 | pipe12 -> lsu 286 | pipe13 -> lsu 287 | pipe14 -> lsu 288 | pipe15 -> lsu 289 | pipe16 -> lsu 290 | pipe17 -> lsu 291 | pipe18 -> lsu 292 | pipe19 -> lsu 293 | pipe20 -> lsu 294 | pipe21 -> lsu 295 | pipe22 -> lsu 296 | 297 | sched23: Scheduler \#23 298 | 299 | # Source: Intel 300 | pipe23: Port 21 { 301 | grid-columns: 1 302 | SALU 303 | SHUF 304 | SIMUL 305 | FADD 306 | FMA 307 | } 308 | rob -> sched23 -> rf -> pipe23 309 | 310 | sched24: Scheduler \#24 311 | 312 | # Source: Intel 313 | pipe24: Port 20 { 314 | grid-columns: 1 315 | SALU 316 | SIMUL 317 | SHUF 318 | FADD 319 | FDIV 320 | AES 321 | SHA 322 | FMA 323 | } 324 | rob -> sched24 -> rf -> pipe24 325 | 326 | sched25: Scheduler \#25 327 | 328 | # Source: Intel 329 | pipe25: Port 22 { 330 | grid-columns: 1 331 | SALU 332 | SIMUL 333 | SHUF 334 | FADD 335 | FDIV 336 | AES 337 | FMA 338 | } 
339 | rob -> sched25 -> rf -> pipe25 340 | 341 | sched26: Scheduler \#26 342 | 343 | # Source: Intel 344 | pipe26: Port 23 { 345 | grid-columns: 1 346 | SALU 347 | SHUF 348 | SIMUL 349 | FADD 350 | FMA 351 | } 352 | rob -> sched26 -> rf -> pipe26 353 | } 354 | 355 | frontend.rename -> backend.rob 356 | 357 | mem: Memory { 358 | l1: L1 DC { 359 | # Source: Intel 360 | l1dc: 32KB L1 DC 361 | } 362 | 363 | l2: L2 { 364 | # Source: Intel 365 | l2dc: 4MB L2 Cache shared among 4 cores 366 | } 367 | 368 | l1 -> l2 369 | } 370 | frontend.l1ic -> mem.l2 371 | backend.lsu -> mem.l1 372 | 373 | info: |md 374 | Drawn by Jiajie Chen @jiegec 375 | 376 | Based on data from Intel 377 | | 378 | } -------------------------------------------------------------------------------- /docs/skymont.md: -------------------------------------------------------------------------------- 1 | # Intel Lunar Lake E-core aka Skymont 2 | 3 | ![](./skymont.svg) 4 | 5 | References: 6 | 7 | - [Intel Details Skymont](https://chipsandcheese.com/2024/06/15/intel-details-skymont/) 8 | - [Thoughts on Skymont Slides](https://chipsandcheese.com/2024/05/30/thoughts-on-skymont-slides/) 9 | - [月光下的新探索:Lunar Lake CPU (Lion Cove / Skymont) 微架构测试](https://blog.hjc.im/lunar-lake-cpu-uarch-review.html) 10 | - [Skymont: Intel’s E-Cores reach for the Sky](https://chipsandcheese.com/2024/10/03/skymont-intels-e-cores-reach-for-the-sky/) 11 | -------------------------------------------------------------------------------- /docs/sunny_cove.d2: -------------------------------------------------------------------------------- 1 | cpu : Intel Ice Lake Sunny Cove CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese 5 | l1btb: 256-entry L1 BTB 6 | 7 | # Source: Chips and Cheese 8 | l2btb: 5120-entry L2 BTB 9 | 10 | # Source: Chips and Cheese 11 | ras: 22-entry RAS 12 | } 13 | 14 | l1ic: L1 IC { 15 | # Source: Chips and Cheese 16 | itlb: 128-entry 8-way ITLB 17 | 18 | # Source: Chips and Cheese 
19 | l1ic: 32KB 8-way L1 IC 20 | } 21 | 22 | bp -> l1ic 23 | 24 | # Source: Chips and Cheese, Intel 25 | # "70/thread; 140/1 thread" 26 | iq: 2x 70-entry Instruction Queue 27 | l1ic -> iq 28 | 29 | # Source: Chips and Cheese 30 | decode: 4-way Decode 31 | iq -> decode 32 | 33 | # Source: Chips and Cheese 34 | uopc: 2304-entry UOP Cache 35 | decode -> uopc 36 | bp -> uopc 37 | 38 | # Source: Chips and Cheese 39 | # Source: Intel, called allocation queue 40 | uop: 2x 70-entry UOP Queue 41 | uopc -> uop 42 | decode -> uop 43 | 44 | # Source: Chips and Cheese 45 | rename: 5-way Rename { 46 | Move Elimination 47 | Zero Idiom 48 | } 49 | uop -> rename 50 | } 51 | 52 | backend: Backend { 53 | # Source: Chips and Cheese, Intel 54 | rob: 352-entry ROB 55 | 56 | # Source: Chips and Cheese 57 | bob: 96-entry Branch Order Buffer 58 | 59 | rf: "" { 60 | # Source: Chips and Cheese, Intel 61 | # Source: jiegec, 239 speculative 62 | irf: 280-entry Integer Register File 63 | 64 | # Source: Chips and Cheese, Intel 65 | # Source: jiegec, 158 speculative 66 | vrf: 224-entry FP/Vector Register File 67 | 68 | # Source: jiegec, 237 speculative 69 | flagsrf: 238-entry Flags Register File 70 | 71 | # Source: Chips and Cheese 72 | mxscrrf: 8-entry MXCSR Register File 73 | 74 | # Source: Chips and Cheese 75 | maskrf: 152-entry Mask Register File 76 | } 77 | 78 | # Source: uops.info 79 | # 10 ports in total: Port 0 to Port 9 80 | # Integer/Memory: 81 | # Port 2 = Port 3: Load AGU 82 | # Port 4 = Port 9: Store Data 83 | # Port 7 = Port 8: Store AGU 84 | # ALU(add): Port 0, 1, 5, 6 85 | # LEA(lea): Port 0, 1, 5, 6 86 | # Shift(shl): Port 0, 6 87 | # INT MUL(imul): Port 1 88 | # INT MUL(imul 64b*64b=128b): Port 1(LO) + Port 5(HI) 89 | # INT DIV(idiv): (Port 0, 1, 5, 6) + Port 1 90 | # JMP/Branch(jmp/jnz): Port 0, 6 91 | # Load(mov): Port 2, 3 92 | # Store(mov): Data(Port 4, 9) + AGU(Port 7, 8) 93 | # 256-bit ALU(VPADDD YMM): Port 0, 1, 5 94 | # 512-bit ALU(VPADDD ZMM): Port 0, 5 95 | # 256-bit 
FMA(VFMADD132PD YMM): Port 0, 1 96 | # 512-bit FMA(VFMADD132PD ZMM): Port 0 97 | # 256-bit FADD(VADDPS YMM): Port 0, 1 98 | # 512-bit FADD(VADDPS ZMM): Port 0 99 | # 256-bit FDIV(VDIVPS YMM): Port 0 100 | # 256-bit Shuffle(VPSHUFD YMM): Port 1, 5 101 | # 512-bit Shuffle(VPSHUFD ZMM): Port 5 102 | # 256-bit Shift(VPSLLD YMM): Port 0, 1 103 | # 512-bit Shift(VPSLLD ZMM): Port 0 104 | 105 | # Source: Chips and Cheese 106 | # Source: jiegec, 81 sched size for fp 107 | # Source: jiegec, 40 sched size for f2i 108 | # Source: Intel, 160 scheduler entries in total(160=80+34+23+23) 109 | sched1: 80-entry Unified Math Scheduler 110 | 111 | # Source: Chips and Cheese 112 | pipe1: Port 0 { 113 | grid-columns: 1 114 | ALU 115 | LEA 116 | Shift 117 | Jump/Branch 118 | 512b ALU 119 | 512b FADD 120 | 512b FMA 121 | 512b Shift 122 | } 123 | rob -> sched1 -> rf -> pipe1 124 | 125 | # Source: Chips and Cheese 126 | pipe2: Port 1 { 127 | grid-columns: 1 128 | ALU 129 | LEA 130 | INT MUL 131 | INT DIV 132 | 256b ALU 133 | 256b FADD 134 | 256b FMA 135 | 256b Shift 136 | 256b Shuffle 137 | } 138 | rob -> sched1 -> rf -> pipe2 139 | 140 | # Source: Chips and Cheese 141 | pipe3: Port 5 { 142 | grid-columns: 1 143 | ALU 144 | LEA 145 | INT MUL HI 146 | 512b ALU 147 | 512b Shuffle 148 | } 149 | rob -> sched1 -> rf -> pipe3 150 | 151 | # Source: Chips and Cheese 152 | pipe4: Port 6 { 153 | grid-columns: 1 154 | ALU 155 | LEA 156 | Shift 157 | Jump/Branch 158 | } 159 | rob -> sched1 -> rf -> pipe4 160 | 161 | # Source: Chips and Cheese 162 | # Source: jiegec, 36 sched size for store data 163 | sched2: 34-entry Store Data Scheduler 164 | 165 | # Source: Chips and Cheese 166 | pipe5: Port 4 { 167 | Store Data 168 | } 169 | rob -> sched2 -> rf -> pipe5 170 | 171 | # Source: Chips and Cheese 172 | pipe6: Port 9 { 173 | Store Data 174 | } 175 | rob -> sched2 -> rf -> pipe6 176 | 177 | # Source: Chips and Cheese 178 | # Source: jiegec, 48(~46=23+23) sched size for load & store address 179 | sched3: 
23-entry AGU Scheduler \#1 180 | 181 | # Source: Chips and Cheese 182 | pipe7: Port 2 { 183 | Load AGU 184 | } 185 | rob -> sched3 -> rf -> pipe7 186 | 187 | # Source: Chips and Cheese 188 | pipe8: Port 8 { 189 | Store AGU 190 | } 191 | rob -> sched3 -> rf -> pipe8 192 | 193 | # Source: Chips and Cheese 194 | sched4: 23-entry AGU Scheduler \#2 195 | 196 | # Source: Chips and Cheese 197 | pipe9: Port 3 { 198 | Load AGU 199 | } 200 | rob -> sched4 -> rf -> pipe9 201 | 202 | # Source: Chips and Cheese 203 | pipe10: Port 7 { 204 | Store AGU 205 | } 206 | rob -> sched4 -> rf -> pipe10 207 | 208 | lsu: LSU { 209 | # Source: Chips and Cheese, Intel 210 | 128-entry Load Queue 211 | 72-entry Store Queue 212 | } 213 | 214 | pipe5 -> lsu 215 | pipe6 -> lsu 216 | pipe7 -> lsu 217 | pipe8 -> lsu 218 | pipe9 -> lsu 219 | pipe10 -> lsu 220 | } 221 | frontend.rename -> backend.rob 222 | frontend.rename -> backend.bob 223 | 224 | mem: Memory { 225 | l1: L1 DC { 226 | # Source: Chips and Cheese 227 | l1dtlb: 64-entry load, 16-entry store L1 DTLB 228 | 229 | # Source: Chips and Cheese, Intel 230 | # Intel says 8-way? 
231 | l1dc: 48KB 12-way L1DC, 5 cycle latency, 64B cache line 232 | 233 | # Source: Chips and Cheese 234 | mshr: 12-entry MSHR 235 | } 236 | 237 | l2: L2 { 238 | # Source: Chips and Cheese, Intel 239 | l2tlb: 2048-entry L2 TLB 240 | 241 | # Source: Chips and Cheese 242 | l2dc: 512 KB 8-way/1280 KB 20-way L2 DC, 13 cycle latency 243 | 244 | # Source: Chips and Cheese 245 | mshr: 32-entry MSHR 246 | } 247 | 248 | # Source: Chips and Cheese 249 | l1 -> l2: 64B/cycle 250 | 251 | l3: L3 { 252 | # Source: Chips and Cheese 253 | l3dc: up to 2MB/core 16-way L3 Cache 254 | } 255 | l2 -> l3 256 | } 257 | frontend.l1ic -> mem.l2 258 | backend.lsu -> mem.l1 259 | 260 | info: |md 261 | Drawn by Jiajie Chen @jiegec 262 | 263 | Based on data from Chips and Cheese, Intel 264 | | 265 | } -------------------------------------------------------------------------------- /docs/sunny_cove.md: -------------------------------------------------------------------------------- 1 | # Intel Ice Lake aka Sunny Cove 2 | 3 | ![](./sunny_cove.svg) 4 | 5 | References: 6 | 7 | - [Sunny Cove: Intel’s Lost Generation](https://chipsandcheese.com/2022/06/07/sunny-cove-intels-lost-generation/) 8 | - [Popping the Hood on Golden Cove](https://chipsandcheese.com/2021/12/02/popping-the-hood-on-golden-cove/) 9 | - [Sunny Cove - Microarchitectures - Intel](https://en.wikichip.org/wiki/intel/microarchitectures/sunny_cove) 10 | - [Golden Cove’s Vector Register File: Checking with Official (SPR) Data](https://chipsandcheese.com/2023/01/15/golden-coves-vector-register-file-checking-with-official-spr-data/) 11 | - [4th Gen Intel Xeon Scalable Sapphire Rapids Leaps Forward](https://www.servethehome.com/4th-gen-intel-xeon-scalable-sapphire-rapids-leaps-forward/7/) 12 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1 13 | - [Built for the Edge: The Next-Generation Intel® Xeon D 2700 & 1700 
processors](https://hc34.hotchips.org/assets/program/conference/day2/Mobile%20and%20Edge/HC2022.XeonDx700.PraveenMosur.FINAL.pdf) 14 | -------------------------------------------------------------------------------- /docs/uarch.csv: -------------------------------------------------------------------------------- 1 | uArch,L1 BTB,L2 BTB,L3 BTB,ITA,RAS,L1 ITLB,L2 ITLB,L1 DTLB,L2 DTLB,L2 Unified TLB,L1 IC,Decode width,UOP/MOP Cache,UOP/MOP width,Rename width,ROB,Int RF,Flag RF,Float RF,Load to use latency,Load to FP use latency,Branch units,ALU units,FP/Vec units,DP FLOP/cycle,Load/Store pipes,Load-only pipes,Store-only pipes 2 | AMD Zen 1,8 (0 bubble),256 (1 bubble),4096 (4 bubble),512,32,64,512,64,1536,,4-way 64KB,4,8-way 256-entry 8-inst/entry,8,6+4,192,168,,160,4,7,2,4,4x 128b,8,0,2,1 3 | AMD Zen 2,16 (0 bubble),512 (1 bubble),7168 (4 bubble),1024,32,64,512,64,2048,,8-way 32KB,4,8-way 512-entry 8-inst/entry,8,6+4,224,180,138,160,4,7,2,4,4x 256b,16,0,2,1 4 | AMD Zen 3,1024 (0 bubble),6656 (3 bubble),,1536,2x32,64,512,64,2048,,8-way 32KB,4,8-way 512-entry 8-mop/entry,8,6,256,192,122,160,4,7,2,4,4x 256b,16,2,1,0 5 | AMD Zen 4,1536 (0 bubble),7168/7680 (3 bubble),,3072,2x32,64,512,72,3072,,8-way 32KB,4,12-way 768-entry 9-mop/entry,9,6,320,224,126,192,4,7,2,4,4x 256b,16,2,1,0 6 | AMD Zen 5,16384 (0 bubble),8192 (8 bubble),,3072,2x52,64,2048,96,4096,,8-way 32KB,2x4,16-way 1024-entry 6-inst/entry,2x6,8,448,240,192,384,4,7,3,6,4x 512b,32,2,2,0 7 | Ampere One,256 (0 bubble),8192 (2 bubble),,,,64,768,64,1536,,4-way 16KB,5,,,4,208,166,,128,,,2,4,2,,0,2,2 8 | Apple Avalanche,1024 (0 bubble),3072 (1 bubble),192KB L1 IC (2 bubble),,50,192,,,,,192KB,8,N/A,N/A,8,274 Group,350,,380,3-4,,2,6,4x 128b,16,1,2,1 9 | Apple Firestorm,1024 (0 bubble),192KB L1 IC (2 bubble),,,50,192,,160,,3072,6-way 192KB,8,N/A,N/A,8,330 Group,380,128,432,3-4,,2,6,4x 128b,16,1,2,1 10 | Apple Icestorm,,,,,32,128,,128,,,128KB,4,N/A,N/A,4,,,,,3-4,,2,4,2x 128b,8,1,0,1 11 | ARM Cortex-A77,64 (0 
bubble),8192,,,,48,,48,,1280,4-way 64KB,6,1536-entry,6,6,160 MOP,,,,4,5,2,4,2,,2,0,0 12 | ARM Cortex-X1,96 (0 bubble),8192 (1 bubble),,,16,48,,40,,2048,4-way 64KB,5,4-way 3072-entry,8,8,224 MOP,,,,4,6,2,4,4,8,2,1,0 13 | ARM Cortex-X2,,,,,,48,,48,,2048,4-way 64KB,5,4-way 3072-entry,8,8,288 MOP,,,,4,6,2,4,4,,2,1,0 14 | ARM Cortex-X3,,,,,32,48,,,,,4-way 64KB,6,4-way 1536-entry,8,8,320 MOP,,,,4,6,2,6,4,,2,1,0 15 | ARM Cortex-X4,,,,,32,48,,96,,2048,4-way 64KB,10,N/A,N/A,10,384 MOP,,,,,,3,8,4,,1,2,1 16 | ARM Cortex-X925,,,,,32,128,,96,,2048,4-way 64KB,10,N/A,N/A,10,?,,,,4,6,3,8,6,,2,2,0 17 | Intel Crestmont,1024 (0 bubble),6144 (2 bubble),,,,64,,48,,3072,8-way 64KB,2x3,,,6,256,214,,207,3-4,,2,4,3,,0,2,2 18 | Intel Golden Cove,128 (0 bubble),6144 (1 bubble),12288 (2 bubble),,20,256,,96,,2048,8-way 32KB,6,4096-entry,8,6,512,288,248,332,5,,2,5,3,,0,3,2 19 | Intel Gracemont,1024 (0 bubble),5120 (2 bubble),,,,64,,32,,2048,8-way 64KB,2x3,,,5,256,214,,207,3-4,,2,4,3x 128b,8,0,2,2 20 | Intel Lion Cove,,,,,,,,128,,,,8,,12,8,576,,,,4,,3,6,4x 256b,,?,?,? 21 | Intel Redwood Cove,128 (0 bubble),2x6144 (1 bubble),,,,,,96,,,64KB,6,,8,6,512,,,,5,,2,5,3x 256b,,?,?,? 22 | Intel Skymont,,,,,,128,,,,,64KB,3x3,,3x3,8,416,,,,,,3,8,4x 128b,,?,?,? 
23 | Intel Sunny Cove,256 (0 bubble),5120 (1 bubble),,,22,128,,64,,2048,8-way 32KB,4,2304-entry,,5,352,280,238,224,5,,2,4,2x 512b,32,0,2,2 24 | Qualcomm Oryon,2048 (0 bubble),192KB L1 IC (2 bubble),,,50,256,,224,,8192,6-way 192KB,8,N/A,N/A,8,680,416,,416,3-4,,2,6,4x 128b,16,2,2,0 25 | -------------------------------------------------------------------------------- /docs/xiaomi.d2: -------------------------------------------------------------------------------- 1 | cpu : Phytium Xiaomi CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Phytium 5 | # "2048-entry BTB" 6 | l1btb: 2048-entry BTB 7 | 8 | # Source: Phytium 9 | # "512-entry indirect predictor" 10 | indir: 512-entry Indirect Predictor 11 | 12 | # Source: Phytium 13 | # "48-entry Speculative Return Stack" 14 | ras: 48-entry Return Address Stack 15 | 16 | # Source: Phytium 17 | # "Direction predict with TAGE predictor" 18 | tage: TAGE-based direction predictor 19 | } 20 | 21 | l1ic: L1 IC { 22 | # Source: Phytium 23 | # "32KB L1 instr. Cache" 24 | l1ic: 32KB L1 IC 25 | } 26 | 27 | # Source: Phytium 28 | # "Loop detect and Instr. 
Cache bypass" 29 | loop: Loop Buffer 30 | 31 | fq: Fetch Queue 32 | bp -> fq 33 | fq -> l1ic 34 | 35 | # Source: Phytium 36 | # "32-entry instruction buffer" 37 | iq: 32-entry Instruction Buffer 38 | 39 | loop -> iq 40 | l1ic -> iq 41 | 42 | # Source: Phytium 43 | # "Up to four instructions decoded per cycle" 44 | decode: 4-way Decode 45 | iq -> decode 46 | 47 | # Source: Phytium 48 | # "Up to four instructions dispatched per cycle" 49 | dispatch: 4-way Dispatch 50 | decode -> dispatch 51 | } 52 | 53 | backend: Backend { 54 | # Source: Phytium 55 | # "Up to four instructions renamed per cycle" 56 | rename: Rename 57 | 58 | # Source: Phytium 59 | # "Reorder buffer can hold 160 instructions" 60 | rob: 160-entry ROB 61 | rename -> rob 62 | 63 | rf: Register File { 64 | # Source: Phytium 65 | # "192 physical registers" 66 | irf: 192-entry Integer Register File 67 | 68 | vrf: Vector Register File 69 | } 70 | 71 | # Source: Phytium 72 | # "Two separated 16-entry integer and ASIMD queues shared by four integer units" 73 | sched1: 16-entry Single-cycle ALU Scheduler 74 | 75 | # Source: Phytium 76 | pipe1: Pipe \#1 { 77 | ALU 78 | BR 79 | } 80 | sched1 -> rf -> pipe1 81 | 82 | # Source: Phytium 83 | pipe2: Pipe \#2 { 84 | ALU 85 | BR 86 | } 87 | sched1 -> rf -> pipe2 88 | 89 | # Source: Phytium 90 | # "Two separated 16-entry integer and ASIMD queues shared by four integer units" 91 | sched2: 16-entry Multi-cycle ALU Scheduler 92 | 93 | # Source: Phytium 94 | pipe3: Pipe \#3 { 95 | MUL 96 | DIV 97 | } 98 | sched2 -> rf -> pipe3 99 | 100 | # Source: Phytium 101 | pipe4: Pipe \#4 { 102 | MUL 103 | DIV 104 | } 105 | sched2 -> rf -> pipe4 106 | 107 | # Source: Phytium 108 | # "One shared 16-entry floating point and ASIMD queue" 109 | sched3: 16-entry FP/Vector Scheduler 110 | 111 | # Source: Phytium 112 | pipe5: Pipe \#5 { 113 | FMAC 114 | FDIV 115 | } 116 | sched3 -> rf -> pipe5 117 | 118 | # Source: Phytium 119 | pipe6: Pipe \#6 { 120 | FMAC 121 | FDIV 122 | } 123 | sched3 -> 
rf -> pipe6 124 | 125 | # Source: Phytium 126 | sched4: AGU Scheduler 127 | 128 | lsu: LSU { 129 | # Source: Phytium 130 | # "One 24-entry load/store queue" 131 | 24-entry Load/Store Queue 132 | 133 | # Source: Phytium 134 | # "4 cycles latency from load to use" 135 | 4 cycle load to use latency 136 | } 137 | 138 | sched4 -> rf -> lsu 139 | 140 | rob -> sched1 141 | rob -> sched2 142 | rob -> sched3 143 | rob -> sched4 144 | } 145 | frontend.dispatch -> backend.rename 146 | 147 | mem: Memory { 148 | # Source: Phytium 149 | # "32KB L1 data cache" 150 | l1dc: 32KB L1DC { 151 | # Source: Phytium 152 | 6 outstanding loads 153 | } 154 | 155 | l2: L2 Cache 156 | l1dc -> l2 157 | } 158 | frontend.l1ic -> mem.l2 159 | backend.lsu -> mem.l1dc 160 | 161 | info: |md 162 | Drawn by Jiajie Chen @jiegec 163 | 164 | Based on data from Phytium 165 | | 166 | } -------------------------------------------------------------------------------- /docs/xiaomi.md: -------------------------------------------------------------------------------- 1 | # Phytium Xiaomi 2 | 3 | ![](./xiaomi.svg) 4 | 5 | References: 6 | 7 | - [Mars: A 64-core ARMv8 Processor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc27/HC27.24-Monday-Epub/HC27.24.30-HP-Cloud-Comm-Epub/HC27.24.321-64core-Zhang-phytium-v1.0.pdf) -------------------------------------------------------------------------------- /docs/zen1.d2: -------------------------------------------------------------------------------- 1 | cpu: AMD Zen1 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: AMD 5 | # "L0BTB holds 4 forward taken branches and 4 backward taken branches, and 6 | # predicts with zero bubbles. L1BTB has 256 entries and creates one bubble 7 | # if prediction differs from L0BTB. L2BTB has 4096 entries and creates 8 | # four bubbles if its prediction differs from L1BTB." 
9 | l1btb: 8-entry(4 forward, 4 backward) L1 BTB, zero bubbles 10 | 11 | # Source: AMD 12 | l2btb: 256-entry L2 BTB, one bubble 13 | 14 | # Source: AMD 15 | l3btb: 4096-entry L3 BTB, four bubbles 16 | 17 | # Source: AMD 18 | # "The processor implements a 512-entry indirect target array" 19 | indir: 512-entry Indirect Target Array 20 | 21 | # Source: AMD 22 | # "The processor implements a 32-entry return address stack (RAS) to 23 | # predict return addresses from a near call." 24 | ras: 32-entry Return Address Stack 25 | } 26 | 27 | l1ic: L1 IC { 28 | # Source: AMD 29 | # "The processor contains a fully-associative L1 instruction TLB (ITLB) 30 | # with 64 entries that can hold 4- Kbyte, 2-Mbyte, or 1-Gbyte page 31 | # entries." 32 | l1itlb: 64-entry L1 ITLB 33 | 34 | # Source: AMD 35 | # "The processor provides an 8-way set associative L2 instruction TLB with 36 | # 512 entries capable of holding 4-Kbyte pages, and 2-Mbyte pages. 1-Gbyte 37 | # pages are not held in the L2 instruction TLB; they are smashed into 38 | # 2-Mbyte pages in the L2 ITLB." 39 | l2itlb: 512-entry 8-way L2 ITLB 40 | 41 | # Source: AMD 42 | # "The AMD Family 17h processor contains a 64-Kbyte, 4-way set associative 43 | # L1 instruction cache." 44 | l1ic: 64KB 4-way L1 IC 45 | } 46 | 47 | fq: Fetch Queue 48 | bp -> fq 49 | fq -> l1ic 50 | 51 | # Source: AMD 52 | # "The fetch unit sends these bytes to the decode unit through a 20 entry 53 | # Instruction Byte Queue (IBQ), each entry holding 16 instruction bytes. In 54 | # SMT mode each thread has 10 dedicated IBQ entries" 55 | iq: 20x 16B Instruction Byte Queue 56 | # Source: AMD 57 | # "The AMD Family 17h processor fetches instructions from the instruction 58 | # cache in 32-byte naturally aligned blocks. The processor can perform an 59 | # instruction block fetch every cycle." 60 | l1ic -> iq: 32 B/cyc 61 | 62 | # Source: AMD 63 | # "The decode unit scans two of these windows in a given cycle, decoding a 64 | # maximum of four instructions." 
65 | decode: 4-way Decode 66 | iq -> decode: 2 IBQ entry 67 | 68 | # Source: AMD 69 | # "The op cache is organized as an associative cache with 32 sets and 8 70 | # ways. At each set-way intersection is an entry containing up to 8 71 | # instructions, so the maximum capacity of the op cache is then 2K 72 | # instructions." 73 | uopc: 256-entry 8-way, 8 inst/entry UOP Cache 74 | decode -> uopc 75 | bp -> uopc 76 | 77 | # Source: AMD 78 | # "the maximum throughput from the op cache is 8 instructions per cycle 79 | # whereas the maximum throughput from the traditional fetch and decode 80 | # pipeline is 4 instructions per cycle." 81 | uop: UOP Queue 82 | uopc -> uop: 8 inst/cycle 83 | decode -> uop: 4 inst/cycle 84 | 85 | rename: Rename { 86 | Move Elimination 87 | Zero Idiom 88 | } 89 | uop -> rename 90 | } 91 | 92 | backend: Backend { 93 | # Source: AMD 94 | # "The unit can receive up to 6 macro ops dispatched per cycle and track up 95 | # to 192 macro ops in-flight. " 96 | # "The retire unit handles in-order commit of up to eight macro ops per 97 | # cycle." 98 | rob: 192-entry ROB, retire 8 op/cycle 99 | 100 | rf: Register File { 101 | # Source: AMD 102 | # "The integer physical register file (PRF) consists of 168 registers, 103 | # with up to 38 per thread mapped to architectural state or 104 | # microarchitectural temporary state." 
105 | irf: 168-entry Integer Register File 106 | 107 | # Source: AMD 108 | # 160 entry Physical Register File in Figure 6 109 | vrf: 160-entry FP/Vector Register File 110 | } 111 | 112 | # Source: AMD 113 | # "ALU micro ops are sent to one of four 14-entry ALU schedulers" 114 | sched1: 14-entry ALU Scheduler \#1 115 | 116 | pipe1: Pipe \#1 { 117 | ALU 118 | } 119 | sched1 -> rf -> pipe1 120 | 121 | # Source: AMD 122 | sched2: 14-entry ALU Scheduler \#2 123 | 124 | pipe2: Pipe \#2 { 125 | ALU 126 | } 127 | sched2 -> rf -> pipe2 128 | 129 | # Source: AMD 130 | sched3: 14-entry ALU Scheduler \#3 131 | 132 | pipe3: Pipe \#3 { 133 | ALU 134 | } 135 | sched3 -> rf -> pipe3 136 | 137 | # Source: AMD 138 | sched4: 14-entry ALU Scheduler \#4 139 | 140 | # Source: AMD (presumably; the original attribution was truncated to "g" - TODO confirm) 141 | pipe4: Pipe \#4 { 142 | ALU 143 | } 144 | sched4 -> rf -> pipe4 145 | 146 | # Source: AMD 147 | # "Load and Store micro ops are sent to one of two 14-entry address 148 | # generation units (AGUs) Each scheduler can issue one micro op per cycle." 149 | sched5: 14-entry AGU Scheduler \#5 150 | 151 | # Source: AMD (presumably; the original attribution was truncated to "g" - TODO confirm) 152 | pipe5: Pipe \#5 { 153 | Load AGU 154 | Store AGU 155 | } 156 | sched5 -> rf -> pipe5 157 | 158 | sched6: 14-entry AGU Scheduler \#6 159 | 160 | pipe6: Pipe \#6 { 161 | Load AGU 162 | Store AGU 163 | } 164 | sched6 -> rf -> pipe6 165 | 166 | lsu: LSU { 167 | # Source: AMD 168 | # "The LS unit includes a 44-entry load queue (LDQ)." 169 | 44-entry Load Queue 170 | # Source: AMD 171 | # "The LS unit utilizes a 44-entry store queue " 172 | 44-entry Store Queue 173 | 174 | # Source: AMD 175 | # "The LS unit contains three largely independent pipelines enabling the 176 | # execution of two 128-bit load memory operations and one 128-bit store 177 | # memory operation per cycle." 
178 | 2x 128b Load Pipe 179 | 1x 128b Store Pipe 180 | 181 | # Source: jiegec, AMD 182 | # AMD: "4-cycle load-to-use integer load latency and 7-cycle load-to-use FP load latency" 183 | 4 cycle load to use latency 184 | 7 cycle load to FP use latency 185 | } 186 | 187 | pipe5 -> lsu 188 | pipe6 -> lsu 189 | 190 | rob -> sched1 191 | rob -> sched2 192 | rob -> sched3 193 | rob -> sched4 194 | rob -> sched5 195 | rob -> sched6 196 | 197 | # Source: AMD 198 | # "The floating-point scheduler has a 36 entry micro-op capacity" 199 | sched7: 36-entry Vector/FP Scheduler \#7 200 | rob -> sched7 201 | 202 | # Source: AMD 203 | pipe7: Pipe \#7 { 204 | FMUL 205 | FMISC 206 | VADD 207 | VMUL 208 | VMISC 209 | AES 210 | } 211 | sched7 -> rf -> pipe7 212 | 213 | # Source: AMD 214 | pipe8: Pipe \#8 { 215 | FMUL 216 | FMISC 217 | VADD 218 | VSHUF 219 | VMISC 220 | AES 221 | CLM 222 | } 223 | sched7 -> rf -> pipe8 224 | 225 | # Source: AMD 226 | pipe9: Pipe \#9 { 227 | FADD 228 | FMISC 229 | STORE 230 | VSHUF 231 | VSHIFT 232 | VMISC 233 | } 234 | sched7 -> rf -> pipe9 -> lsu 235 | 236 | # Source: AMD 237 | pipe10: Pipe \#10 { 238 | FADD 239 | FCVT 240 | FDIV 241 | FMISC 242 | VADD 243 | VMISC 244 | } 245 | sched7 -> rf -> pipe10 246 | } 247 | frontend.rename -> backend.rob 248 | 249 | mem: Memory { 250 | l1: L1 DC { 251 | # Source: AMD 252 | # "The fully-associative L1 data TLB (DTLB) provides 64 entries that hold 253 | # 4-Kbyte, 2-Mbyte, or 1- Gbyte page entries." 254 | l1dtlb: 64-entry L1 DTLB 255 | 256 | # Source: AMD 257 | # "The L2 data TLB provides a unified 12-way set-associative L2 data TLB 258 | # with 1536 entries" 259 | l2dtlb: 1536-entry 12-way L2 DTLB 260 | 261 | # Source: AMD 262 | # "The AMD Family 17h processor contains a 32-Kbyte, 8-way set associative 263 | # L1 data cache" 264 | l1dc: 32KB 8-way L1DC 265 | 266 | # Source: AMD 267 | # "A hardware table walker loads page table information into the TLBs." 
268 | ptw: 1 Page Table Walker 269 | } 270 | 271 | l2: L2 { 272 | # Source: AMD 273 | # "The AMD Family 17h processor implements a unified 8-way set associative 274 | # write-back L2 cache per core. This on-die L2 cache is inclusive of the 275 | # L1 caches in the core. The L2 cache size is 512 Kbytes with a variable 276 | # load-to-use latency of no less than 12 cycles." 277 | l2dc: 512KB 8-way L2 Cache 278 | } 279 | 280 | # Source: AMD 281 | # "The L2 to L1 data path is 32 bytes wide." 282 | l1 -> l2: 32B/cycle 283 | 284 | l3: L3 { 285 | # Source: AMD 286 | # "The AMD Family 17h processor implements a 4 MB or 8-MB L3 cache 287 | # (depending on SOC configuration) that is 16-way set associative and 288 | # shared by four cores inside a CPU complex." 289 | l3dc: 4MB/8MB 16-way L3 Cache 290 | } 291 | l2 -> l3 292 | } 293 | frontend.l1ic -> mem.l2 294 | backend.lsu -> mem.l1 295 | 296 | info: |md 297 | Drawn by Jiajie Chen @jiegec 298 | 299 | Based on data from AMD 300 | | 301 | } 302 | -------------------------------------------------------------------------------- /docs/zen1.md: -------------------------------------------------------------------------------- 1 | # AMD Zen1 2 | 3 | ![](./zen1.svg) 4 | 5 | References: 6 | 7 | - Software Optimization Guide for AMD Family 17h Processors 8 | - [The AMD Zen and Ryzen 7 Review: A Deep Dive on 1800X, 1700X and 1700](https://www.anandtech.com/show/11170/the-amd-zen-and-ryzen-7-review-a-deep-dive-on-1800x-1700x-and-1700/4) 9 | -------------------------------------------------------------------------------- /docs/zen2.md: -------------------------------------------------------------------------------- 1 | # AMD Zen2 2 | 3 | ![](./zen2.svg) 4 | 5 | References: 6 | 7 | - [Deep Diving Neoverse N1](https://chipsandcheese.com/2021/10/22/deep-diving-neoverse-n1/) 8 | - [AMD Zen 2 Microarchitecture Analysis: Ryzen 3000 and EPYC 
Rome](https://www.anandtech.com/show/14525/amd-zen-2-microarchitecture-analysis-ryzen-3000-and-epyc-rome/6) 9 | - [AMD Zen 3 Ryzen Deep Dive Review: 5950X, 5900X, 5800X and 5600X Tested](https://www.anandtech.com/show/16214/amd-zen-3-ryzen-deep-dive-review-5950x-5900x-5800x-and-5700x-tested/4) 10 | - [X86ScheduleZnver2.td in LLVM](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/X86/X86ScheduleZnver2.td) 11 | - Software Optimization Guide for AMD Family 17h Models 30h and Greater Processors 12 | - [Zen 2 Dieshot](https://www.bilibili.com/video/BV1Af421i7jY/) 13 | -------------------------------------------------------------------------------- /docs/zen3.d2: -------------------------------------------------------------------------------- 1 | cpu : AMD Zen3 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese, AMD 5 | # "L1BTB has 1024 entries and predicts with zero bubbles for conditional 6 | # and unconditional direct branches, and one cycle for calls, returns and 7 | # indirect branches. L2BTB has 6656 entries and creates three bubbles if 8 | # its prediction differs from L1BTB." 
9 | l1btb: 1024-entry L1 BTB, zero bubbles 10 | 11 | # Source: Chips and Cheese, AMD 12 | l2btb: 6656-entry L2 BTB, three bubbles 13 | 14 | # Source: Chips and Cheese, AMD 15 | # "The processor implements a 1536-entry indirect target array" 16 | indir: 1536-entry Indirect Target Array 17 | 18 | # Source: Chips and Cheese, AMD 19 | # "The processor implements a 32-entry return address stack (RAS) per thread" 20 | ras: 32-entry Return Address Stack 21 | 22 | # Source: AMD 23 | penalty: 11-18 cycle branch misprediction latency, 13 typical 24 | } 25 | 26 | l1ic: L1 IC { 27 | # Source: Chips and Cheese, AMD 28 | l1itlb: 64-entry L1 ITLB 29 | 30 | # Source: Chips and Cheese, AMD 31 | l2itlb: 512-entry L2 ITLB 32 | 33 | # Source: Chips and Cheese, AMD 34 | l1ic: 32KB 8-way L1 IC 35 | } 36 | 37 | # Source: Chips and Cheese 38 | fq: 64-entry Fetch Queue 39 | bp -> fq 40 | fq -> l1ic 41 | 42 | # Source: AMD 43 | # "The fetch unit sends these bytes to the decode unit through a 24 entry 44 | # Instruction Byte Queue (IBQ), each entry holding 16 instruction bytes. In 45 | # SMT mode each thread has 12 dedicated IBQ entries." 46 | iq: 24x 16B Instruction Byte Queue 47 | 48 | # Source: AMD 49 | # "The processor fetches instructions from the instruction cache in 32-byte 50 | # blocks that are 16-byte aligned and contained within a 64-byte aligned 51 | # block. The processor can perform such a 32-byte fetch every cycle." 52 | l1ic -> iq: 32B/cycle 53 | 54 | # Source: Chips and Cheese, AMD 55 | # AMD: "The decode unit scans two of these IBQ entries in a given cycle, 56 | # decoding a maximum of four instructions." 57 | decode: 4-way Decode 58 | iq -> decode: 2 IBQ entry 59 | 60 | # Source: Chips and Cheese, AMD 61 | # "The op cache is organized as an associative cache with 64 sets and 8 62 | # ways. At each set-way intersection is an entry containing up to 8 macro 63 | # ops." 
64 | uopc: 512-entry 8-way, 8 mop/entry UOP Cache 65 | decode -> uopc 66 | bp -> uopc 67 | 68 | # Source: Chips and Cheese 69 | uop: UOP Queue 70 | uopc -> uop: 8 macro ops/cycle 71 | decode -> uop: 4 instructions/cycle 72 | 73 | # Source: Chips and Cheese 74 | rename: 6-way Rename { 75 | Move Elimination 76 | Zero Idiom 77 | } 78 | uop -> rename 79 | } 80 | 81 | backend: Backend { 82 | # Source: Chips and Cheese, AMD 83 | # "The unit can receive up to 6 macro ops dispatched per cycle and track up 84 | # to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode." 85 | rob: 256-entry ROB 86 | 87 | # Source: Chips and Cheese 88 | bob: 48-taken-entry 117-not-taken-entry Branch Order Buffer 89 | 90 | rf: Register File { 91 | # Source: Chips and Cheese, AMD 92 | # "The integer physical register file (PRF) consists of 192 registers, 93 | # with up to 38 per thread mapped to architectural state or 94 | # micro-architectural temporary state." 95 | irf: 192-entry Integer Register File 96 | 97 | # Source: Chips and Cheese 98 | flagsrf: 122-entry Flags Register File 99 | 100 | # Source: Chips and Cheese, AMD 101 | vrf: 160-entry 256b Vector Register File 102 | } 103 | 104 | # Source: Chips and Cheese, AMD 105 | # Source: AMD, 96 integer scheduler entries 106 | sched1: 24-entry Scheduler \#1 107 | 108 | # Source: Chips and Cheese 109 | pipe1: Pipe \#1 { 110 | ALU 111 | CMOV 112 | } 113 | sched1 -> rf -> pipe1 114 | 115 | # Source: Chips and Cheese 116 | pipe2: Pipe \#2 { 117 | Branch 118 | } 119 | sched1 -> rf -> pipe2 120 | 121 | # Source: Chips and Cheese, AMD 122 | sched2: 24-entry Scheduler \#2 123 | 124 | # Source: Chips and Cheese 125 | pipe3: Pipe \#3 { 126 | ALU 127 | } 128 | sched2 -> rf -> pipe3 129 | 130 | # Source: Chips and Cheese 131 | pipe4: Pipe \#4 { 132 | AGU 133 | } 134 | sched2 -> rf -> pipe4 135 | 136 | # Source: Chips and Cheese, AMD 137 | sched3: 24-entry Scheduler \#3 138 | 139 | # Source: Chips and Cheese 140 | pipe5: Pipe \#5 { 141 | 
ALU 142 | INT MUL 143 | PDEP 144 | CRC 145 | } 146 | sched3 -> rf -> pipe5 147 | 148 | # Source: Chips and Cheese 149 | pipe6: Pipe \#6 { 150 | AGU 151 | } 152 | sched3 -> rf -> pipe6 153 | 154 | # Source: Chips and Cheese, AMD 155 | sched4: 24-entry Scheduler \#4 156 | 157 | # Source: Chips and Cheese 158 | pipe7: Pipe \#7 { 159 | ALU 160 | INT DIV 161 | CMOV 162 | Branch 163 | } 164 | sched4 -> rf -> pipe7 165 | 166 | # Source: Chips and Cheese 167 | pipe8: Pipe \#8 { 168 | AGU 169 | } 170 | sched4 -> rf -> pipe8 171 | 172 | lsu: LSU { 173 | # Source: Chips and Cheese, AMD 174 | # 116-entry Load Queue reported by Chips and Cheese 175 | # 72-entry Load Queue reported by AMD 176 | # "The LS unit can process up to 72 out-of-order loads." 177 | # "The LS unit utilizes a 64-entry store queue (STQ)." 178 | 72-entry Load Queue 179 | 64-entry Store Queue 180 | 181 | # Source: AMD 182 | # 3 loads per cycle (max 2 if 256b) 183 | # 2 stores per cycle (max 1 if 256b) 184 | # Max 3 total memory ops 185 | # "The LS unit contains three largely independent pipelines enabling the 186 | # execution of three 256-bit memory operations per cycle. All three memory 187 | # operations can be loads, with a separate maximum of two 128- or 188 | # 256-bit loads. A maximum of two of the memory operations can be stores, 189 | # with a maximum of one store if the store is a 128- or 256-bit store." 190 | 3 Load per cycle (max 2 if 128b/256b) 191 | 2 Store per cycle (max 1 if 128b/256b) 192 | 193 | # Source: jiegec 194 | # Source: AMD 195 | # "4-cycle load-to-use integer load latency and 7-cycle load-to-use FP load latency." 
196 | 4 cycle load to use latency 197 | 7 cycle load to FP use latency 198 | } 199 | 200 | pipe4 -> lsu 201 | pipe6 -> lsu 202 | pipe8 -> lsu 203 | 204 | rob -> sched1 205 | rob -> sched2 206 | rob -> sched3 207 | rob -> sched4 208 | 209 | # Source: Chips and Cheese, AMD 210 | nsq: 64-entry Non/Pre-Scheduling Queue 211 | rob -> nsq 212 | 213 | # Source: Chips and Cheese, AMD 214 | sched5: 32-entry Scheduler \#5 215 | nsq -> sched5 216 | 217 | # Source: Chips and Cheese 218 | pipe9: Pipe \#9 { 219 | FMA 220 | FMisc 221 | INT Vec ALU 222 | INT Vec MUL 223 | AES 224 | } 225 | sched5 -> rf -> pipe9 226 | 227 | # Source: Chips and Cheese 228 | pipe10: Pipe \#10 { 229 | FADD 230 | FMisc 231 | INT Vec ALU 232 | Shuffle 233 | Convert 234 | } 235 | sched5 -> rf -> pipe10 236 | 237 | # Source: Chips and Cheese 238 | pipe11: Pipe \#11 { 239 | FStore 240 | F2I 241 | } 242 | sched5 -> rf -> pipe11 -> lsu 243 | 244 | # Source: Chips and Cheese, AMD 245 | sched6: 32-entry Scheduler \#6 246 | nsq -> sched6 247 | 248 | # Source: Chips and Cheese 249 | pipe12: Pipe \#12 { 250 | FMA 251 | FMisc 252 | INT Vec ALU 253 | Shuffle 254 | AES 255 | Vec Shift 256 | FDIV 257 | } 258 | sched6 -> rf -> pipe12 259 | 260 | # Source: Chips and Cheese 261 | pipe13: Pipe \#13 { 262 | FADD 263 | FMisc 264 | INT Vec ALU 265 | INT Vec MUL 266 | } 267 | sched6 -> rf -> pipe13 268 | 269 | # Source: Chips and Cheese, AMD 270 | pipe14: Pipe \#14 { 271 | # FStore + F2I reported by Chips and Cheese 272 | # FStore reported by AMD 273 | FStore 274 | } 275 | sched6 -> rf -> pipe14 -> lsu 276 | } 277 | frontend.rename -> backend.rob 278 | frontend.rename -> backend.bob 279 | 280 | mem: Memory { 281 | l1: L1 DC { 282 | # Source: Chips and Cheese, AMD 283 | l1dtlb: 64-entry L1 DTLB 284 | 285 | # Source: Chips and Cheese, AMD 286 | l2dtlb: 2048-entry L2 DTLB 287 | 288 | # Source: Chips and Cheese, AMD 289 | l1dc: 32KB 8-way L1DC 290 | 291 | # Source: AMD 292 | ptw: 6 Page Table Walkers 293 | } 294 | 295 | l2: L2 { 
296 | # Source: Chips and Cheese, AMD 297 | l2dc: 512KB 8-way L2 Cache, 12 cycle latency 298 | } 299 | 300 | # Source: Chips and Cheese, AMD 301 | l1 -> l2: 32B/cycle 302 | 303 | l3: L3 { 304 | # Source: Chips and Cheese, AMD 305 | l3dc: 32MB (4MB/core) 16-way L3 Cache, 46 cycle latency 306 | } 307 | l2 -> l3 308 | } 309 | frontend.l1ic -> mem.l2 310 | backend.lsu -> mem.l1 311 | 312 | info: |md 313 | Drawn by Jiajie Chen @jiegec 314 | 315 | Based on data from Chips and Cheese, AMD 316 | | 317 | } -------------------------------------------------------------------------------- /docs/zen3.md: -------------------------------------------------------------------------------- 1 | # AMD Zen3 2 | 3 | ![](./zen3.svg) 4 | 5 | References: 6 | 7 | - [AMD Next Generation "Zen 4" Core and 4th Gen AMD EPYC(TM) 9004 Server CPU](https://hc2023.hotchips.org/assets/program/conference/day1/CPU1/HC_Zen4_Epyc_Final_20230825%20-%20Embargoed%20until%20Aug%2029%202023.pdf) 8 | - [AMD Next Generation "Zen 3" Core](https://hc33.hotchips.org/assets/program/conference/day1/HC2021.C1.2%20AMD%20Mark%20Evers.pdf) 9 | - [AMD’s Zen 4 Part 1: Frontend and Execution Engine](https://chipsandcheese.com/2022/11/05/amds-zen-4-part-1-frontend-and-execution-engine/) 10 | - [AMD Zen 3 Ryzen Deep Dive Review: 5950X, 5900X, 5800X and 5600X Tested](https://www.anandtech.com/show/16214/amd-zen-3-ryzen-deep-dive-review-5950x-5900x-5800x-and-5700x-tested/4) 11 | 12 | -------------------------------------------------------------------------------- /docs/zen4.d2: -------------------------------------------------------------------------------- 1 | cpu : AMD Zen4 CPU { 2 | frontend: Frontend { 3 | bp: Branch Predictor { 4 | # Source: Chips and Cheese, AMD 5 | # "The L1 BTB has 1536 entries and predicts with zero prediction bubbles 6 | # for conditional and unconditional direct branches, and one cycle bubble 7 | # for calls, returns and indirect branches. 
The L2 BTB has 7680 entries 8 | # and creates three prediction bubbles if its prediction differs from that 9 | # of the L1 BTB." 10 | l1btb: 1536-entry L1 BTB, zero bubble 11 | 12 | # Source: Chips and Cheese, AMD 13 | l2btb: 7168/7680-entry L2 BTB, three bubble 14 | 15 | # Source: Chips and Cheese, AMD 16 | # "The processor implements a 3072-entry indirect target array" 17 | indir: 3072-entry Indirect Target Array 18 | 19 | # Source: Chips and Cheese, AMD 20 | # "The processor implements a 32-entry return address stack (RAS) per thread" 21 | ras: 32-entry Return Address Stack 22 | 23 | # Source: AMD 24 | 2 taken predictions per cycle 25 | } 26 | 27 | l1ic: L1 IC { 28 | # Source: Chips and Cheese, AMD 29 | l1itlb: 64-entry L1 ITLB 30 | 31 | # Source: Chips and Cheese, AMD 32 | l2itlb: 512-entry L2 ITLB 33 | 34 | # Source: Chips and Cheese, AMD 35 | l1ic: 32KB 8-way L1 IC 36 | } 37 | 38 | fq: Fetch Queue 39 | bp -> fq 40 | fq -> l1ic 41 | 42 | # Source: AMD 43 | # "The fetch unit sends these bytes to the decode unit through a 24 entry 44 | # Instruction Byte Queue (IBQ), each entry holding 16 instruction bytes. In 45 | # SMT mode each thread has 12 dedicated IBQ entries." 46 | iq: 24x 16B Instruction Byte Queue 47 | 48 | # Source: AMD 49 | # "The processor fetches instructions from the instruction cache in 32-byte 50 | # blocks that are 16-byte aligned and contained within a 64-byte aligned 51 | # block. The processor can perform a 32-byte fetch every cycle." 52 | l1ic -> iq: 32B/cycle 53 | 54 | # Source: Chips and Cheese, AMD 55 | # AMD: "The decode unit scans two of these IBQ entries in a given cycle, 56 | # decoding a maximum of four instructions." 57 | decode: 4-way Decode 58 | iq -> decode: 2x IBQ entry 59 | 60 | # Source: Chips and Cheese, AMD 61 | # "The Op Cache is organized as an associative cache with 64 sets and 12 62 | # ways. At each set-way intersection is an entry containing up to 9 macro 63 | # ops." 
64 | uopc: 768-entry 12-way, 9 mops/entry UOP Cache 65 | decode -> uopc 66 | bp -> uopc 67 | 68 | # Source: Chips and Cheese 69 | uop: UOP Queue 70 | uopc -> uop 71 | decode -> uop 72 | 73 | # Source: Chips and Cheese 74 | rename: 6-way Rename { 75 | Move Elimination 76 | Zero Idiom 77 | } 78 | uop -> rename 79 | } 80 | 81 | backend: Backend { 82 | # Source: Chips and Cheese, AMD 83 | # "The unit can receive up to 6 macro ops dispatched per cycle and track up 84 | # to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode" 85 | rob: 320-entry ROB 86 | 87 | # Source: Chips and Cheese 88 | bob: 62-taken-entry 118-not-taken-entry Branch Order Buffer 89 | 90 | rf: Register File { 91 | # Source: Chips and Cheese, AMD 92 | # "The integer physical register file (PRF) consists of 224 registers, 93 | # with up to 38 per thread mapped to architectural state or 94 | # micro-architectural temporary state." 95 | irf: 224-entry Integer Register File 96 | 97 | # Source: Chips and Cheese, AMD 98 | # 238 reported by Chips and Cheese 99 | # 126 reported by AMD 100 | flagsrf: 126-entry Flags Register File 101 | 102 | # Source: Chips and Cheese, AMD 103 | vmaskrf: (52+16)-entry Vector Mask Register File 104 | 105 | # Source: Chips and Cheese, AMD 106 | vrf: 192-entry 512b Vector Register File 107 | } 108 | 109 | # Source: Chips and Cheese, AMD 110 | sched1: 24-entry Scheduler \#1 111 | 112 | # Source: Chips and Cheese 113 | pipe1: Pipe \#1 { 114 | ALU 115 | } 116 | sched1 -> rf -> pipe1 117 | 118 | # Source: Chips and Cheese 119 | pipe2: Pipe \#2 { 120 | Branch 121 | } 122 | sched1 -> rf -> pipe2 123 | 124 | # Source: Chips and Cheese, AMD 125 | sched2: 24-entry Scheduler \#2 126 | 127 | # Source: Chips and Cheese 128 | pipe3: Pipe \#3 { 129 | ALU 130 | } 131 | sched2 -> rf -> pipe3 132 | 133 | # Source: Chips and Cheese 134 | pipe4: Pipe \#4 { 135 | AGU 136 | } 137 | sched2 -> rf -> pipe4 138 | 139 | # Source: Chips and Cheese, AMD 140 | sched3: 24-entry Scheduler 
\#3 141 | 142 | # Source: Chips and Cheese 143 | pipe5: Pipe \#5 { 144 | ALU 145 | } 146 | sched3 -> rf -> pipe5 147 | 148 | # Source: Chips and Cheese 149 | pipe6: Pipe \#6 { 150 | AGU 151 | } 152 | sched3 -> rf -> pipe6 153 | 154 | # Source: Chips and Cheese, AMD 155 | sched4: 24-entry Scheduler \#4 156 | 157 | # Source: Chips and Cheese 158 | pipe7: Pipe \#7 { 159 | ALU 160 | Branch 161 | } 162 | sched4 -> rf -> pipe7 163 | 164 | # Source: Chips and Cheese 165 | pipe8: Pipe \#8 { 166 | AGU 167 | } 168 | sched4 -> rf -> pipe8 169 | 170 | lsu: LSU { 171 | # Source: Chips and Cheese, AMD 172 | # 136 (Load Validation Queue) & 88 (Load Execution Queue) reported by Chips and Cheese 173 | # 88 reported by AMD 174 | # "The LS can track up to 48 uncompleted loads and up to 88 completed 175 | # loads." 176 | 136-entry Load Validation Queue 177 | 88-entry Load (Execution) Queue 178 | # "The LS unit utilizes a 64-entry store queue (STQ) which holds stores 179 | # from dispatch until the store data can be written to the data cache." 180 | 64-entry Store Queue 181 | 182 | # Source: AMD 183 | # 3 loads per cycle (max 2 if 256b) 184 | # 2 stores per cycle (max 1 if 256b) 185 | 3 Load per cycle (2 if 256b) 186 | 2 Store per cycle (1 if 256b) 187 | 188 | # Source: AMD 189 | # "4-cycle load-to-use integer load latency and 7-cycle load-to-use FP load latency." 
190 | 4 cycle load to use latency 191 | 7 cycle load to FP use latency 192 | } 193 | 194 | pipe4 -> lsu 195 | pipe6 -> lsu 196 | pipe8 -> lsu 197 | 198 | rob -> sched1 199 | rob -> sched2 200 | rob -> sched3 201 | rob -> sched4 202 | 203 | # Source: Chips and Cheese, AMD 204 | nsq: 64-entry Non/Pre-Scheduling Queue 205 | rob -> nsq 206 | 207 | # Source: Chips and Cheese, AMD 208 | sched5: 32-entry Scheduler \#5 209 | nsq -> sched5 210 | 211 | # Source: Chips and Cheese 212 | pipe9: Pipe \#9 { 213 | FMA 214 | } 215 | sched5 -> rf -> pipe9 216 | 217 | # Source: Chips and Cheese 218 | pipe10: Pipe \#10 { 219 | FADD 220 | } 221 | sched5 -> rf -> pipe10 222 | 223 | # Source: Chips and Cheese 224 | pipe11: Pipe \#11 { 225 | FStore 226 | F2I 227 | } 228 | sched5 -> rf -> pipe11 -> lsu 229 | 230 | # Source: Chips and Cheese, AMD 231 | sched6: 32-entry Scheduler \#6 232 | nsq -> sched6 233 | 234 | # Source: Chips and Cheese 235 | pipe12: Pipe \#12 { 236 | FMA 237 | } 238 | sched6 -> rf -> pipe12 239 | 240 | # Source: Chips and Cheese 241 | pipe13: Pipe \#13 { 242 | FADD 243 | } 244 | sched6 -> rf -> pipe13 245 | 246 | # Source: Chips and Cheese, AMD 247 | pipe14: Pipe \#14 { 248 | # FStore + F2I reported by Chips and Cheese 249 | # FStore reported by AMD 250 | FStore 251 | } 252 | sched6 -> rf -> pipe14 -> lsu 253 | } 254 | frontend.rename -> backend.rob 255 | frontend.rename -> backend.bob 256 | 257 | mem: Memory { 258 | l1: L1 DC { 259 | # Source: Chips and Cheese, AMD 260 | l1dtlb: 72-entry L1 DTLB 261 | 262 | # Source: Chips and Cheese, AMD 263 | l2dtlb: 3072-entry L2 DTLB 264 | 265 | # Source: Chips and Cheese, AMD 266 | l1dc: 32KB 8-way L1DC 267 | 268 | # Source: AMD 269 | # "The AMD Zen4 microarchitecture has six hardware page table walkers to 270 | # handle L2 TLB misses." 
271 | ptw: 6 Page Table Walkers 272 | } 273 | 274 | l2: L2 { 275 | # Source: Chips and Cheese, AMD 276 | l2dc: 1MB 8-way L2 Cache, 14 cycle latency 277 | } 278 | 279 | # Source: Chips and Cheese, AMD 280 | l1 -> l2: 32B/cycle 281 | 282 | l3: L3 { 283 | # Source: Chips and Cheese, AMD 284 | l3dc: 32MB (4MB/core) 16-way L3 Cache, 50 cycle latency 285 | } 286 | l2 -> l3 287 | } 288 | frontend.l1ic -> mem.l2 289 | backend.lsu -> mem.l1 290 | 291 | info: |md 292 | Drawn by Jiajie Chen @jiegec 293 | 294 | Based on data from Chips and Cheese, Anandtech and AMD 295 | | 296 | } -------------------------------------------------------------------------------- /docs/zen4.md: -------------------------------------------------------------------------------- 1 | # AMD Zen4 2 | 3 | ![](./zen4.svg) 4 | 5 | References: 6 | 7 | - [AMD Zen 4 Ryzen 9 7950X and Ryzen 5 7600X Review: Retaking The High-End](https://www.anandtech.com/show/17585/amd-zen-4-ryzen-9-7950x-and-ryzen-5-7600x-review-retaking-the-high-end/8) 8 | - [AMD’s Zen 4 Part 1: Frontend and Execution Engine](https://chipsandcheese.com/2022/11/05/amds-zen-4-part-1-frontend-and-execution-engine/) 9 | - [AMD’s Zen 4, Part 2: Memory Subsystem and Conclusion](https://chipsandcheese.com/2022/11/08/amds-zen-4-part-2-memory-subsystem-and-conclusion/) 10 | - [Inside the AMD Zen 4 CPU architecture](https://www.custompc.com/inside-amd-zen-4-ryzen-cpu-architecture) 11 | - [AMD Reveals More Zen 5 CPU Core Details](https://www.phoronix.com/review/amd-zen-5-core) 12 | - [AMD Next Generation "Zen 4" Core and 4th Gen AMD EPYC(TM) 9004 Server CPU](https://hc2023.hotchips.org/assets/program/conference/day1/CPU1/HC_Zen4_Epyc_Final_20230825%20-%20Embargoed%20until%20Aug%2029%202023.pdf) 13 | - [AMD Next-Generation “Zen 4” Core and 4th Gen AMD EPYC Server CPUs](https://ieeexplore.ieee.org/document/10466769) 14 | 15 | -------------------------------------------------------------------------------- /docs/zen5.md: 
-------------------------------------------------------------------------------- 1 | # AMD Zen5 2 | 3 | ![](./zen5.svg) 4 | 5 | References: 6 | 7 | - [AMD Reveals More Zen 5 CPU Core Details](https://www.phoronix.com/review/amd-zen-5-core) 8 | - [Zen 5’s 2-Ahead Branch Predictor Unit: How a 30 Year Old Idea Allows for New Tricks](https://chipsandcheese.com/2024/07/26/zen-5s-2-ahead-branch-predictor-unit-how-30-year-old-idea-allows-for-new-tricks/) 9 | - [Zen 5’s Leaked Slides](https://chipsandcheese.com/2023/10/08/zen-5s-leaked-slides/) 10 | - [AMD’s Strix Point: Zen 5 Hits Mobile](https://chipsandcheese.com/2024/08/10/amds-strix-point-zen-5-hits-mobile/) 11 | - [AMD’s Ryzen 9950X: Zen 5 on Desktop](https://chipsandcheese.com/2024/08/14/amds-ryzen-9950x-zen-5-on-desktop/) 12 | - [Zen 5 Dieshot](https://www.bilibili.com/opus/965843745820901377) 13 | - [Strix Point Dieshot](https://www.bilibili.com/opus/959217298443337751) 14 | - [Software Optimization Guide for the AMD Zen5 Microarchitecture](https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/software-optimization-guides/58455.zip) 15 | - [Discussing AMD’s Zen 5 at Hot Chips 2024](https://chipsandcheese.com/2024/09/15/discussing-amds-zen-5-at-hot-chips-2024/) 16 | - [AMD EPYC 9965 "Turin Dense" Delivers Better Performance/Power Efficiency vs. 
AmpereOne 192-Core ARM CPU](https://www.phoronix.com/review/amd-epyc-9965-ampereone) 17 | - [AMD EPYC 9755 / 9575F / 9965 Benchmarks Show Dominating Performance](https://www.phoronix.com/review/amd-epyc-9965-9755-benchmarks) 18 | - [5TH GEN AMD EPYC™ PROCESSOR ARCHITECTURE](https://www.amd.com/content/dam/amd/en/documents/epyc-business-docs/white-papers/5th-gen-amd-epyc-processor-architecture-white-paper.pdf) 19 | - [The AMD Zen 5 Microarchitecture: Powering Ryzen AI 300 Series For Mobile and Ryzen 9000 for Desktop](https://www.anandtech.com/show/21469/amd-details-ryzen-ai-300-series-for-mobile-strix-point-with-rdna-35-igpu-xdna-2-npu) 20 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | docs/main.py -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: CPU Microarchitecture Diagrams 2 | site_url: https://jia.je/cpu 3 | repo_url: https://github.com/jiegec/cpu 4 | edit_uri: edit/master/docs/ 5 | nav: 6 | - CPU Microarchitecture Diagrams: index.md 7 | - Comparisons across microarchitectures: comparison.md 8 | - Reverse engineered conditional branch predictors: cbp.md 9 | - AArch64: 10 | - ARM Cortex-A75: cortex_a75.md 11 | - ARM Cortex-A77: cortex_a77.md 12 | - ARM Cortex-X1: cortex_x1.md 13 | - ARM Cortex-X2: cortex_x2.md 14 | - ARM Cortex-X3: cortex_x3.md 15 | - ARM Cortex-X4: cortex_x4.md 16 | - ARM Cortex-X925: cortex_x925.md 17 | - ARM Neoverse-V2: neoverse_v2.md 18 | - Ampere One: ampere_one.md 19 | - Apple M1 P-core (Firestorm): firestorm.md 20 | - Apple M2 P-core (Avalanche): avalanche.md 21 | - Apple M3 P-core: m3_pcore.md 22 | - Apple M4 P-core: m4_pcore.md 23 | - Phytium Xiaomi: xiaomi.md 24 | - Qualcomm Oryon: oryon.md 25 | - AMD64: 26 | - AMD Zen1: zen1.md 27 | - AMD Zen2: zen2.md 28 | - AMD 
Zen3: zen3.md 29 | - AMD Zen4: zen4.md 30 | - AMD Zen5: zen5.md 31 | - Intel Skylake: skylake.md 32 | - Intel Ice Lake (Sunny Cove): sunny_cove.md 33 | - Intel Alder Lake P-core (Golden Cove): golden_cove.md 34 | - Intel Alder Lake E-core (Gracemont): gracemont.md 35 | - Intel Meteor Lake P-core (Redwood Cove): redwood_cove.md 36 | - Intel Meteor Lake E-core (Crestmont): crestmont.md 37 | - Intel Lunar Lake P-core (Lion Cove): lion_cove.md 38 | - Intel Lunar Lake E-core (Skymont): skymont.md 39 | - LoongArch64: 40 | - Loongson 3A6000: 3a6000.md 41 | - RISC-V64: 42 | - SiFive P550: p550.md 43 | - SiFive P870: p870.md 44 | - Dieshot: dieshot.md 45 | theme: 46 | name: material 47 | icon: 48 | repo: fontawesome/brands/github 49 | features: 50 | - content.action.edit 51 | - navigation.expand 52 | - navigation.tracking 53 | - navigation.sections 54 | - search.suggest 55 | - search.highlight 56 | - search.share 57 | plugins: 58 | - git-revision-date-localized: 59 | enable_creation_date: true 60 | - glightbox: 61 | height: 100vh 62 | - search 63 | - offline 64 | - social: 65 | cards_layout_options: 66 | font_family: Noto Sans SC 67 | - macros 68 | markdown_extensions: 69 | - pymdownx.superfences: 70 | custom_fences: 71 | - name: mermaid 72 | class: mermaid 73 | format: !!python/name:pymdownx.superfences.fence_code_format 74 | extra: 75 | analytics: 76 | provider: google 77 | property: G-3109FRSVTT 78 | copyright: Copyright © 2024 Jiajie Chen 79 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "cpu" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Jiajie Chen "] 6 | readme = "README.md" 7 | package-mode = false 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.9" 11 | mkdocs = "^1.6.0" 12 | mkdocs-material = "^9.5.30" 13 | mkdocs-glightbox = "^0.4.0" 14 | mkdocs-git-revision-date-localized-plugin = 
"^1.2.6" 15 | cairosvg = "^2.7.1" 16 | mkdocs-macros-plugin = "^1.0.5" 17 | pandas = "^2.2.2" 18 | tabulate = "^0.9.0" 19 | 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | --------------------------------------------------------------------------------