├── .github
    └── workflows
    │   └── publish.yml
├── .gitignore
├── README.md
├── data
    ├── firestorm
    │   ├── btb.csv
    │   ├── cache.txt
    │   ├── lsu.csv
    │   └── ras.csv
    ├── geekerwan
    │   ├── a14_a16_BV1gm4y157No.png
    │   ├── a16_a17_BV1gm4y157No.png
    │   ├── a17_BV1gm4y157No.png
    │   ├── a17_ecore_BV1gm4y157No.png
    │   └── m2_m3_BV1NJ4m1w7zk.png
    ├── golden_cove
    │   ├── bob.csv
    │   ├── ras.csv
    │   ├── rf.csv
    │   ├── rob.csv
    │   └── sched.csv
    └── zen2
    │   └── lsu.csv
├── docs
    ├── 3a6000.d2
    ├── 3a6000.md
    ├── Makefile
    ├── ampere_one.d2
    ├── ampere_one.md
    ├── avalanche.d2
    ├── avalanche.md
    ├── cbp.md
    ├── comparison.md
    ├── cortex_a75.d2
    ├── cortex_a75.md
    ├── cortex_a77.d2
    ├── cortex_a77.md
    ├── cortex_x1.d2
    ├── cortex_x1.md
    ├── cortex_x2.d2
    ├── cortex_x2.md
    ├── cortex_x3.d2
    ├── cortex_x3.md
    ├── cortex_x4.d2
    ├── cortex_x4.md
    ├── cortex_x925.d2
    ├── cortex_x925.md
    ├── crestmont.d2
    ├── crestmont.md
    ├── dieshot.md
    ├── firestorm.d2
    ├── firestorm.md
    ├── golden_cove.d2
    ├── golden_cove.md
    ├── gracemont.d2
    ├── gracemont.md
    ├── index.md
    ├── lion_cove.d2
    ├── lion_cove.md
    ├── m3_pcore.d2
    ├── m3_pcore.md
    ├── m4_pcore.d2
    ├── m4_pcore.md
    ├── main.py
    ├── neoverse_n2.md
    ├── neoverse_v2.d2
    ├── neoverse_v2.md
    ├── oryon.d2
    ├── oryon.md
    ├── p550.d2
    ├── p550.md
    ├── p870.d2
    ├── p870.md
    ├── redwood_cove.d2
    ├── redwood_cove.md
    ├── skylake.d2
    ├── skylake.md
    ├── skymont.d2
    ├── skymont.md
    ├── sunny_cove.d2
    ├── sunny_cove.md
    ├── uarch.csv
    ├── xiaomi.d2
    ├── xiaomi.md
    ├── zen1.d2
    ├── zen1.md
    ├── zen2.d2
    ├── zen2.md
    ├── zen3.d2
    ├── zen3.md
    ├── zen4.d2
    ├── zen4.md
    ├── zen5.d2
    └── zen5.md
├── main.py
├── mkdocs.yml
├── poetry.lock
└── pyproject.toml


/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: publish
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - master  # deploy only on pushes to master
 6 | permissions:
 7 |   contents: write  # `mkdocs gh-deploy` (last step) pushes the built site back to the repository
 8 | jobs:
 9 |   deploy:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |         with:
14 |           fetch-depth: 0 # for git history
15 |       - uses: actions/cache@v4  # persist .cache between runs
16 |         with:
17 |           key: ${{ github.ref }}  # one cache entry per ref (branch)
18 |           path: .cache
19 |       - uses: Gr1N/setup-poetry@v9  # provides the `poetry` command used below
20 |       - run: curl -fsSL https://d2lang.com/install.sh | sh -s --  # install the d2 CLI that docs/Makefile invokes
21 |       - run: poetry install
22 |       - run: cd docs && make  # render each docs/*.d2 diagram to .svg
23 |       - run: poetry run mkdocs gh-deploy --force
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .cache/
2 | site/
3 | *.svg
4 | __pycache__
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CPU Microarchitecture Diagrams
2 | 
3 | See <https://jia.je/cpu> to find microarchitecture diagrams of several CPUs.
4 | 


--------------------------------------------------------------------------------
/data/firestorm/btb.csv:
--------------------------------------------------------------------------------
  1 | size,stride,min,avg,max
  2 | 1,4,2.55,2.59,2.62
  3 | 2,4,1.80,1.81,1.81
  4 | 3,4,1.53,1.54,1.56
  5 | 4,4,1.40,1.40,1.41
  6 | 5,4,1.32,1.32,1.33
  7 | 6,4,1.27,1.27,1.27
  8 | 7,4,1.23,1.23,1.23
  9 | 8,4,1.20,1.20,1.21
 10 | 9,4,1.18,1.18,1.18
 11 | 10,4,1.16,1.16,1.17
 12 | 11,4,1.15,1.15,1.15
 13 | 12,4,1.13,1.13,1.14
 14 | 13,4,1.12,1.12,1.12
 15 | 14,4,1.11,1.12,1.12
 16 | 15,4,1.11,1.11,1.11
 17 | 16,4,1.10,1.10,1.10
 18 | 17,4,1.10,1.10,1.10
 19 | 19,4,1.09,1.27,1.63
 20 | 20,4,1.08,1.08,1.08
 21 | 21,4,1.08,1.08,1.08
 22 | 24,4,1.07,1.07,1.07
 23 | 25,4,1.06,1.08,1.09
 24 | 26,4,1.06,1.06,1.06
 25 | 30,4,1.05,1.05,1.06
 26 | 31,4,1.05,1.05,1.05
 27 | 32,4,1.05,1.05,1.05
 28 | 33,4,1.05,1.05,1.05
 29 | 39,4,1.04,1.04,1.04
 30 | 40,4,1.04,1.04,1.04
 31 | 41,4,1.04,1.04,1.04
 32 | 49,4,1.03,1.03,1.03
 33 | 50,4,1.03,1.03,1.03
 34 | 51,4,1.03,1.03,1.03
 35 | 62,4,1.03,1.03,1.03
 36 | 63,4,1.03,1.31,1.50
 37 | 64,4,1.02,1.03,1.03
 38 | 65,4,1.02,1.02,1.02
 39 | 80,4,1.02,1.02,1.02
 40 | 81,4,1.02,1.02,1.02
 41 | 82,4,1.02,1.02,1.02
 42 | 101,4,1.02,1.02,1.02
 43 | 102,4,1.02,1.02,1.02
 44 | 103,4,1.02,1.02,1.02
 45 | 127,4,1.01,1.01,1.01
 46 | 128,4,1.01,1.01,1.02
 47 | 129,4,1.01,1.01,1.01
 48 | 160,4,1.01,1.01,1.01
 49 | 161,4,1.01,1.02,1.05
 50 | 162,4,1.01,1.01,1.02
 51 | 202,4,1.01,1.01,1.01
 52 | 203,4,1.01,1.01,1.01
 53 | 204,4,1.01,1.01,1.01
 54 | 255,4,1.01,1.01,1.01
 55 | 256,4,1.01,1.02,1.05
 56 | 257,4,1.01,1.01,1.01
 57 | 322,4,1.01,1.01,1.01
 58 | 323,4,1.01,1.01,1.01
 59 | 324,4,1.01,1.01,1.04
 60 | 406,4,1.00,1.00,1.00
 61 | 407,4,1.00,1.00,1.00
 62 | 408,4,1.00,1.00,1.00
 63 | 511,4,1.00,1.01,1.03
 64 | 512,4,1.00,1.00,1.00
 65 | 513,4,1.00,1.00,1.00
 66 | 644,4,1.00,1.01,1.05
 67 | 645,4,1.00,1.00,1.00
 68 | 646,4,1.00,1.00,1.00
 69 | 812,4,1.00,1.00,1.00
 70 | 813,4,1.00,1.00,1.01
 71 | 814,4,1.00,1.00,1.00
 72 | 1023,4,1.00,1.00,1.00
 73 | 1024,4,1.00,1.00,1.00
 74 | 1025,4,1.00,1.00,1.00
 75 | 1289,4,1.84,1.84,1.85
 76 | 1290,4,1.85,1.85,1.85
 77 | 1291,4,1.85,1.86,1.87
 78 | 1624,4,2.91,2.91,2.91
 79 | 1625,4,2.91,2.91,2.92
 80 | 1626,4,2.92,2.92,2.93
 81 | 2046,4,2.88,2.88,2.89
 82 | 2047,4,2.88,2.88,2.89
 83 | 2048,4,2.88,2.89,2.89
 84 | 2049,4,2.88,2.89,2.92
 85 | 2579,4,2.88,2.89,2.89
 86 | 2580,4,2.89,2.89,2.89
 87 | 2581,4,2.89,2.89,2.91
 88 | 3250,4,2.88,2.89,2.89
 89 | 3251,4,2.88,2.88,2.89
 90 | 3252,4,2.88,2.88,2.89
 91 | 4095,4,2.89,2.89,2.89
 92 | 4096,4,2.88,2.88,2.89
 93 | 4097,4,2.88,2.89,2.89
 94 | 5160,4,3.00,3.00,3.00
 95 | 5161,4,3.00,3.00,3.00
 96 | 5162,4,3.00,3.01,3.01
 97 | 6501,4,2.96,2.97,2.97
 98 | 6502,4,2.96,2.96,2.97
 99 | 6503,4,2.96,2.97,2.97
100 | 8191,4,3.00,3.00,3.01
101 | 8192,4,3.00,3.01,3.02
102 | 8193,4,3.00,3.00,3.01
103 | 10320,4,3.00,3.01,3.02
104 | 10321,4,3.00,3.00,3.01
105 | 10322,4,3.00,3.01,3.02
106 | 13003,4,3.00,3.00,3.00
107 | 13004,4,3.00,3.00,3.01
108 | 13005,4,3.00,3.00,3.01
109 | 16383,4,3.00,3.01,3.04
110 | 16384,4,3.00,3.01,3.03
111 | 16385,4,3.01,3.01,3.01
112 | 20642,4,3.01,3.02,3.04
113 | 20643,4,3.01,3.03,3.06
114 | 20644,4,3.02,3.03,3.05
115 | 26008,4,3.01,3.02,3.03
116 | 26009,4,3.02,3.03,3.04
117 | 26010,4,3.00,3.01,3.03
118 | 32767,4,3.01,3.03,3.05
119 | 32768,4,3.01,3.03,3.06
120 | 32769,4,3.01,3.04,3.05
121 | 41284,4,3.02,3.03,3.06
122 | 41285,4,3.03,3.05,3.07
123 | 41286,4,3.01,3.01,3.02
124 | 52015,4,3.15,3.16,3.17
125 | 52016,4,3.16,3.17,3.20
126 | 52017,4,3.15,3.17,3.18
127 | 65535,4,3.37,3.37,3.38
128 | 65536,4,3.37,3.37,3.37
129 | 65537,4,3.36,3.37,3.38
130 | 82569,4,3.37,3.37,3.38
131 | 82570,4,3.36,3.36,3.36
132 | 82571,4,3.36,3.38,3.39
133 | 104031,4,3.36,3.37,3.40
134 | 104032,4,3.37,3.37,3.38
135 | 104033,4,3.36,3.37,3.37
136 | 


--------------------------------------------------------------------------------
/data/firestorm/cache.txt:
--------------------------------------------------------------------------------
 1 | hw.perflevel0.l1icachesize: 196608
 2 | hw.perflevel0.l1dcachesize: 131072
 3 | hw.perflevel0.l2cachesize: 12582912
 4 | hw.perflevel1.l1icachesize: 131072
 5 | hw.perflevel1.l1dcachesize: 65536
 6 | hw.perflevel1.l2cachesize: 4194304
 7 | hw.cacheconfig: 8 1 4 0 0 0 0 0 0 0
 8 | hw.cachesize: 3708731392 65536 4194304 0 0 0 0 0 0 0
 9 | hw.cachelinesize: 128
10 | hw.l1icachesize: 131072
11 | hw.l1dcachesize: 65536
12 | hw.l2cachesize: 4194304
13 | 


--------------------------------------------------------------------------------
/data/firestorm/ras.csv:
--------------------------------------------------------------------------------
 1 | size,min,avg,max
 2 | 1,1.00,1.00,1.00
 3 | 2,0.50,0.84,1.00
 4 | 3,0.67,0.78,1.00
 5 | 4,0.75,0.75,0.75
 6 | 5,0.60,0.73,0.80
 7 | 6,0.67,0.73,0.83
 8 | 7,0.71,0.72,0.86
 9 | 8,0.62,0.96,4.38
10 | 9,0.67,0.74,3.44
11 | 10,0.70,0.74,1.90
12 | 11,0.64,0.70,0.73
13 | 12,0.67,0.70,0.75
14 | 13,0.69,0.70,0.77
15 | 14,0.64,0.86,15.36
16 | 15,0.67,0.70,0.73
17 | 16,0.69,0.70,0.75
18 | 17,0.65,0.70,0.76
19 | 18,0.67,0.69,0.72
20 | 19,0.68,0.70,1.26
21 | 20,0.70,1.06,1.85
22 | 21,0.67,0.85,2.76
23 | 22,0.68,0.73,2.09
24 | 23,0.65,0.71,1.22
25 | 24,0.67,0.71,1.08
26 | 25,0.68,1.14,4.76
27 | 26,0.69,1.44,14.19
28 | 27,0.67,0.98,3.96
29 | 28,0.68,1.06,5.14
30 | 29,0.66,0.74,2.21
31 | 30,0.67,1.05,2.20
32 | 31,0.68,1.30,6.94
33 | 32,1.06,1.13,4.84
34 | 33,0.67,6.56,34.64
35 | 34,0.71,2.37,8.91
36 | 35,0.74,5.89,12.63
37 | 36,0.78,3.04,16.00
38 | 37,0.81,6.41,8.59
39 | 38,0.84,3.00,10.87
40 | 39,0.87,0.89,1.15
41 | 40,0.90,2.23,8.12
42 | 41,0.93,0.97,1.59
43 | 42,0.95,7.05,95.43
44 | 43,0.98,1.03,1.79
45 | 44,1.00,4.55,48.61
46 | 45,1.02,10.14,122.67
47 | 46,1.04,1.06,1.07
48 | 47,1.06,1.08,1.09
49 | 48,1.08,4.71,34.50
50 | 49,1.10,5.24,76.29
51 | 50,1.12,5.01,50.88
52 | 51,5.57,9.63,65.25
53 | 52,5.48,10.23,95.85
54 | 53,5.40,6.67,36.87
55 | 54,5.31,6.87,55.11
56 | 55,5.24,8.34,27.25
57 | 56,5.16,7.96,38.00
58 | 57,5.11,26.98,348.40
59 | 58,5.03,7.27,56.45
60 | 59,4.97,6.83,53.93
61 | 60,4.90,6.46,45.75
62 | 61,4.85,5.53,9.25
63 | 62,4.77,10.00,47.16
64 | 63,4.71,17.70,202.35
65 | 64,4.66,8.36,51.64
66 | 


--------------------------------------------------------------------------------
/data/geekerwan/a14_a16_BV1gm4y157No.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a14_a16_BV1gm4y157No.png


--------------------------------------------------------------------------------
/data/geekerwan/a16_a17_BV1gm4y157No.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a16_a17_BV1gm4y157No.png


--------------------------------------------------------------------------------
/data/geekerwan/a17_BV1gm4y157No.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a17_BV1gm4y157No.png


--------------------------------------------------------------------------------
/data/geekerwan/a17_ecore_BV1gm4y157No.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/a17_ecore_BV1gm4y157No.png


--------------------------------------------------------------------------------
/data/geekerwan/m2_m3_BV1NJ4m1w7zk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiegec/cpu/6a8bf762ae0cfd29fe6eb5bad4a1434e06fb821c/data/geekerwan/m2_m3_BV1NJ4m1w7zk.png


--------------------------------------------------------------------------------
/data/golden_cove/ras.csv:
--------------------------------------------------------------------------------
 1 | size,min,avg,max
 2 | 1,1.16,1.16,1.19
 3 | 2,1.07,1.07,1.09
 4 | 3,1.06,1.07,1.12
 5 | 4,1.04,1.04,1.09
 6 | 5,1.03,1.03,1.04
 7 | 6,1.19,1.19,1.20
 8 | 7,1.16,1.16,1.17
 9 | 8,1.14,1.14,1.16
10 | 9,1.13,1.13,1.13
11 | 10,1.11,1.11,1.12
12 | 11,1.10,1.10,1.11
13 | 12,1.09,1.10,1.10
14 | 13,1.09,1.10,1.88
15 | 14,1.15,1.17,1.59
16 | 15,1.14,1.15,1.56
17 | 16,1.13,1.13,1.15
18 | 17,1.13,1.13,1.49
19 | 18,1.12,1.13,1.49
20 | 19,1.11,1.11,1.11
21 | 20,1.11,1.11,1.11
22 | 21,2.20,2.20,2.20
23 | 22,2.05,2.20,2.30
24 | 23,2.05,2.30,3.21
25 | 24,2.31,2.31,2.31
26 | 25,2.28,2.40,2.45
27 | 26,2.35,2.40,2.50
28 | 27,2.41,2.43,2.64
29 | 28,2.47,2.48,2.61
30 | 29,2.52,2.56,2.77
31 | 30,2.57,2.57,2.66
32 | 31,2.62,2.63,2.88
33 | 32,2.66,2.72,3.69
34 | 33,2.86,2.86,2.86
35 | 34,2.76,2.88,2.95
36 | 35,2.89,2.92,2.95
37 | 36,2.81,2.85,3.06
38 | 37,2.84,2.84,2.91
39 | 38,2.87,2.88,3.36
40 | 39,3.08,3.18,3.18
41 | 40,2.93,3.03,3.35
42 | 41,2.97,2.99,3.15
43 | 42,2.98,3.02,3.07
44 | 43,3.01,3.11,3.46
45 | 44,3.03,3.03,3.08
46 | 45,3.07,3.07,3.07
47 | 46,3.07,3.13,3.46
48 | 47,3.09,3.15,3.32
49 | 48,3.31,3.32,3.32
50 | 49,3.14,3.20,3.67
51 | 50,3.14,3.16,3.34
52 | 51,3.16,3.31,3.38
53 | 52,3.18,3.20,3.37
54 | 53,3.23,3.23,3.26
55 | 54,3.22,3.23,3.34
56 | 55,3.23,3.26,3.59
57 | 56,3.23,3.26,3.56
58 | 57,3.25,3.33,3.44
59 | 58,3.26,3.26,3.33
60 | 59,3.28,3.30,3.54
61 | 60,3.29,3.29,3.35
62 | 61,3.31,3.39,3.46
63 | 62,3.31,3.31,3.37
64 | 63,3.33,3.46,5.05
65 | 64,3.33,3.35,3.55
66 | 


--------------------------------------------------------------------------------
/docs/3a6000.d2:
--------------------------------------------------------------------------------
  1 | cpu : Loongson 3A6000 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       l1btb: 64-entry L1 BTB, 1 cycle latency
  6 | 
  7 |       # Source: Chips and Cheese
  8 |       ras: 16-entry RAS
  9 |     }
 10 | 
 11 |     l1ic: L1 IC {
 12 |       # Source: Chips and Cheese
 13 |       l1ic: 64KB 4-way L1 IC
 14 |     }
 15 | 
 16 |     bp -> l1ic
 17 | 
 18 |     iq: Instruction Queue
 19 |     l1ic -> iq
 20 | 
 21 |     # Source: Chips and Cheese
 22 |     decode: 6-way Decode
 23 |     iq -> decode
 24 | 
 25 |     # Source: Chips and Cheese
 26 |     rename: 6-way Rename
 27 |     decode -> rename
 28 |   }
 29 | 
 30 |   backend: Backend {
 31 |     # Source: Chips and Cheese
 32 |     rob: 256-entry ROB
 33 | 
 34 |     rf: Register File {
 35 |       # Source: Chips and Cheese
 36 |       irf: 192-entry Integer Register File
 37 | 
 38 |       # Source: Chips and Cheese
 39 |       vrf: 192-entry Vector Register File
 40 |     }
 41 | 
 42 |     # Source: Chips and Cheese
 43 |     sched1: 48-entry Integer Scheduler \#1
 44 | 
 45 |     # Source: Chips and Cheese
 46 |     pipe1: Pipe \#1 {
 47 |       ALU
 48 |       INT MUL
 49 |       Branch
 50 |     }
 51 |     rob -> sched1 -> rf -> pipe1
 52 | 
 53 |     # Source: Chips and Cheese
 54 |     pipe2: Pipe \#2 {
 55 |       ALU
 56 |       INT MUL
 57 |       Branch
 58 |     }
 59 |     rob -> sched1 -> rf -> pipe2
 60 | 
 61 |     # Source: Chips and Cheese
 62 |     pipe3: Pipe \#3 {
 63 |       ALU
 64 |     }
 65 |     rob -> sched1 -> rf -> pipe3
 66 | 
 67 |     # Source: Chips and Cheese
 68 |     pipe4: Pipe \#4 {
 69 |       ALU
 70 |     }
 71 |     rob -> sched1 -> rf -> pipe4
 72 | 
 73 |     # Source: Chips and Cheese
 74 |     sched2: 48-entry Memory Scheduler \#2
 75 | 
 76 |     # Source: Chips and Cheese
 77 |     pipe5: Pipe \#5 {
 78 |       Load AGU
 79 |     }
 80 |     rob -> sched2 -> rf -> pipe5
 81 | 
 82 |     # Source: Chips and Cheese
 83 |     pipe6: Pipe \#6 {
 84 |       Load AGU
 85 |     }
 86 |     rob -> sched2 -> rf -> pipe6
 87 | 
 88 |     # Source: Chips and Cheese
 89 |     pipe7: Pipe \#7 {
 90 |       Store AGU
 91 |     }
 92 |     rob -> sched2 -> rf -> pipe7
 93 | 
 94 |     # Source: Chips and Cheese
 95 |     pipe8: Pipe \#8 {
 96 |       Store AGU
 97 |     }
 98 |     rob -> sched2 -> rf -> pipe8
 99 | 
100 |     lsu: LSU {
101 |       # Source: Chips and Cheese
102 |       75-entry Load Queue
103 |       64-entry Store Queue
104 |     }
105 | 
106 |     pipe5 -> lsu
107 |     pipe6 -> lsu
108 |     pipe7 -> lsu
109 |     pipe8 -> lsu
110 | 
111 |     # Source: Chips and Cheese
112 |     sched3: 48-entry FP/Vector Scheduler \#3
113 | 
114 |     # Source: Chips and Cheese
115 |     pipe9: Pipe \#9 {
116 |       Vec ALU
117 |       Vec FADD
118 |       FMUL
119 |     }
120 |     rob -> sched3 -> rf -> pipe9
121 | 
122 |     # Source: Chips and Cheese
123 |     pipe10: Pipe \#10 {
124 |       Vec ALU
125 |       Vec FADD
126 |       FMUL
127 |     }
128 |     rob -> sched3 -> rf -> pipe10
129 | 
130 |     # Source: Chips and Cheese
131 |     pipe11: Pipe \#11 {
132 |       Vec ALU
133 |       Vec FADD
134 |       Vec FMUL
135 |       FADD
136 |     }
137 |     rob -> sched3 -> rf -> pipe11
138 | 
139 |     # Source: Chips and Cheese
140 |     pipe12: Pipe \#12 {
141 |       Vec ALU
142 |       Vec FADD
143 |       Vec FMUL
144 |       FADD
145 |     }
146 |     rob -> sched3 -> rf -> pipe12
147 |   }
148 |   frontend.rename -> backend.rob
149 | 
150 |   mem: Memory {
151 |     l1: L1 DC {
152 |       # Source: Chips and Cheese
153 |       l1dtlb: 64-entry L1 DTLB
154 | 
155 |       # Source: Chips and Cheese
156 |       l1dc: 64KB 4-way L1DC
157 |     }
158 | 
159 |     l2: L2 {
160 |       # Source: Chips and Cheese
161 |       l2dc: 256KB 4-way L2 Cache
162 |     }
163 |     l1 -> l2
164 |   }
165 |   frontend.l1ic -> mem.l2
166 |   backend.lsu -> mem.l1
167 | 
168 |   info: |md
169 |     Drawn by Jiajie Chen @jiegec
170 | 
171 |     Based on data from Chips and Cheese, Loongson
172 |   |
173 | }


--------------------------------------------------------------------------------
/docs/3a6000.md:
--------------------------------------------------------------------------------
1 | # Loongson 3A6000
2 | 
3 | ![](./3a6000.svg)
4 | 
5 | References:
6 | 
7 | - [Loongson 3A6000: A Star among Chinese CPUs](https://chipsandcheese.com/2024/03/13/loongson-3a6000-a-star-among-chinese-cpus/)
8 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | SRCS=$(wildcard *.d2)
2 | DSTS=$(patsubst %.d2,%.svg,$(SRCS))
3 | 
4 | # `all` names no file; .PHONY keeps a stray file called `all` from masking the build.
5 | .PHONY: all
6 | all: $(DSTS)
7 | 
8 | %.svg: %.d2  # $< below is this single .d2 source, $@ the .svg output
9 | 	d2 $< $@ --layout=elk --sketch=true


--------------------------------------------------------------------------------
/docs/ampere_one.d2:
--------------------------------------------------------------------------------
  1 | cpu : Ampere One CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Ampere
  5 |       l1btb: 256-entry L1 BTB, zero bubble
  6 | 
  7 |       # Source: Ampere
  8 |       l2btb: 8192-entry L2 BTB, two bubbles
  9 | 
 10 |       # Source: Ampere
 11 |       indir: indirect branch predictor
 12 | 
 13 |       # Source: Ampere
 14 |       tage: 8-table TAGE direction predictor
 15 | 
 16 |       # Source: Ampere
 17 |       latency: 10-cycle branch mispredict recovery
 18 |     }
 19 | 
 20 |     l1ic: L1 IC {
 21 |       # Source: Ampere
 22 |       l1itlb: 64-entry 4-way L1 ITLB
 23 | 
 24 |       # Source: Ampere
 25 |       l1ic: 16KB 4-way L1 IC
 26 |     }
 27 | 
 28 |     # Source: Ampere
 29 |     fq: 32-entry Fetch Queue
 30 |     bp -> fq
 31 |     fq -> l1ic
 32 | 
 33 |     # Source: Ampere
 34 |     iq: Instruction Queue
 35 |     l1ic -> iq: 5 inst/cycle
 36 | 
 37 |     # Source: Ampere
 38 |     decode: 5-way Decode
 39 |     iq -> decode
 40 | 
 41 |     # Source: Ampere
 42 |     rename: 4-way Rename {
 43 |       Macro Fusion
 44 |     }
 45 |     decode -> rename: 4 MOP/cycle
 46 |   }
 47 | 
 48 |   backend: Backend {
 49 |     # Source: Ampere
 50 |     rob: 208-entry ROB
 51 | 
 52 |     rf: Register File {
 53 |       # Source: Ampere
 54 |       irf: 166-entry Integer Register File
 55 | 
 56 |       # Source: Ampere
 57 |       vrf: 128-entry FP/Vector Register File
 58 |     }
 59 | 
 60 |     # Source: Ampere
 61 |     sched1: Integer Scheduler A0
 62 | 
 63 |     # Source: Ampere
 64 |     pipe1: Pipe \#1 {
 65 |       ALU
 66 |       Branch
 67 |       Flag
 68 |     }
 69 |     rob -> sched1 -> rf -> pipe1
 70 | 
 71 |     # Source: Ampere
 72 |     sched2: Integer Scheduler B0
 73 | 
 74 |     # Source: Ampere
 75 |     pipe2: Pipe \#2 {
 76 |       ALU
 77 |       Complex shift
 78 |     }
 79 |     rob -> sched2 -> rf -> pipe2
 80 | 
 81 |     # Source: Ampere
 82 |     sched3: Integer Scheduler B1
 83 | 
 84 |     # Source: Ampere
 85 |     pipe3: Pipe \#3 {
 86 |       Multicycle
 87 |     }
 88 |     rob -> sched2 -> rf -> pipe3
 89 |     rob -> sched3 -> rf -> pipe3
 90 | 
 91 |     # Source: Ampere
 92 |     pipe4: Pipe \#4 {
 93 |       ALU
 94 |       Complex shift
 95 |     }
 96 |     rob -> sched3 -> rf -> pipe4
 97 | 
 98 |     # Source: Ampere
 99 |     sched4: Integer Scheduler A1
100 | 
101 |     # Source: Ampere
102 |     pipe5: Pipe \#5 {
103 |       ALU
104 |       Branch
105 |       Flag
106 |     }
107 |     rob -> sched4 -> rf -> pipe5
108 | 
109 |     # Source: Ampere
110 |     sched5: FP/Vector Scheduler X
111 | 
112 |     # Source: Ampere
113 |     pipe6: Pipe \#6 {
114 |       Vector
115 |       FP
116 |     }
117 |     rob -> sched5 -> rf -> pipe6
118 | 
119 |     # Source: Ampere
120 |     sched6: FP/Vector Scheduler Y
121 | 
122 |     # Source: Ampere
123 |     pipe7: Pipe \#7 {
124 |       FP store data
125 |     }
126 |     rob -> sched5 -> rf -> pipe7
127 |     rob -> sched6 -> rf -> pipe7
128 | 
129 |     # Source: Ampere
130 |     pipe8: Pipe \#8 {
131 |       Vector
132 |       FP
133 |     }
134 |     rob -> sched6 -> rf -> pipe8
135 | 
136 |     # Source: Ampere
137 |     sched7: Memory Scheduler 0
138 | 
139 |     # Source: Ampere
140 |     pipe9: Pipe \#9 {
141 |       Load
142 |     }
143 |     rob -> sched7 -> rf -> pipe9
144 | 
145 |     # Source: Ampere
146 |     pipe10: Pipe \#10 {
147 |       Store
148 |     }
149 |     rob -> sched7 -> rf -> pipe10
150 | 
151 |     # Source: Ampere
152 |     sched8: Memory Scheduler 1
153 | 
154 |     # Source: Ampere
155 |     pipe11: Pipe \#11 {
156 |       Load
157 |     }
158 |     rob -> sched8 -> rf -> pipe11
159 | 
160 |     # Source: Ampere
161 |     pipe12: Pipe \#12 {
162 |       Store
163 |     }
164 |     rob -> sched8 -> rf -> pipe12
165 | 
166 |     lsu: LSU {
167 |       Load Queue
168 |       Store Queue
169 |     }
170 | 
171 |     pipe9 -> lsu
172 |     pipe10 -> lsu
173 |     pipe11 -> lsu
174 |     pipe12 -> lsu
175 |   }
176 |   frontend.rename -> backend.rob
177 | 
178 |   mem: Memory {
179 |     l1: L1 DC {
180 |       # Source: Ampere
181 |       l1dtlb: 64-entry fully-associative L1 DTLB
182 | 
183 |       # Source: Ampere
184 |       l1dc: 64KB 4-way L1 DC
185 | 
186 |       # Source: Ampere
187 |       l2itlb: 768-entry 6-way L2 ITLB
188 | 
189 |       # Source: Ampere
190 |       l2dtlb: 1536-entry 6-way L2 DTLB
191 | 
192 |       # Source: Ampere
193 |       4-cycle load to use
194 |       2x128bit load and 1x128bit write per cycle
195 |       8 page table walkers
196 |     }
197 | 
198 |     l2: L2 {
199 |       # Source: Ampere
200 |       2MB 8-way L2 Cache
201 |       11-cycle load to use latency
202 |     }
203 | 
204 |     l1 -> l2
205 | 
206 |     l3: L3 {
207 |       32MB L3 Cache
208 |     }
209 |     l2 -> l3
210 |   }
211 |   frontend.l1ic -> mem.l2
212 |   backend.lsu -> mem.l1
213 | 
214 |   info: |md
215 |     Drawn by Jiajie Chen @jiegec
216 | 
217 |     Based on data from Chips and Cheese, Ampere
218 |   |
219 | }


--------------------------------------------------------------------------------
/docs/ampere_one.md:
--------------------------------------------------------------------------------
1 | # Ampere One
2 | 
3 | ![](./ampere_one.svg)
4 | 
5 | References:
6 | 
7 | - [AmpereOne at Hot Chips 2024: Maximizing Density](https://chipsandcheese.com/2024/08/29/ampereone-at-hot-chips-2024-maximizing-density/)
8 | 


--------------------------------------------------------------------------------
/docs/avalanche.d2:
--------------------------------------------------------------------------------
  1 | cpu : Apple M2 Avalanche CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: JamesAslan
  5 |       l1btb: 1024-entry L1 BTB, 1 cycle latency
  6 | 
  7 |       # Source: JamesAslan
  8 |       l2btb: 3072-entry L2 BTB, 2 cycle latency
  9 | 
 10 |       # Source: JamesAslan
 11 |       l3btb: 192KB L1 IC as L3 BTB, 3 cycle latency
 12 |     }
 13 | 
 14 |     # Coupled Frontend
 15 |     l1ic: L1 IC {
 16 |       # Source: Geekerwan
 17 |       l1ic: 192KB L1 IC
 18 |     }
 19 | 
 20 |     bp -> l1ic
 21 | 
 22 |     # Source: Geekerwan
 23 |     decode: 8-way Decode
 24 |     l1ic -> decode
 25 |     decode -> bp
 26 | 
 27 |     # Source: Geekerwan
 28 |     rename: 8-way Rename
 29 |     decode -> rename
 30 |   }
 31 | 
 32 |   backend: Backend {
 33 |     # Source: Geekerwan
 34 |     rob: 274-entry Coalesced ROB
 35 | 
 36 |     rf: Register File {
 37 |       # Source: Geekerwan
 38 |       irf: ~350-entry Integer Register File
 39 | 
 40 |       # Source: Geekerwan
 41 |       vrf: ~380-entry 128b Vector Register File
 42 |     }
 43 | 
 44 |     # Source: Geekerwan
 45 |     dispatch1: 12-entry Dispatch Queue \#1
 46 | 
 47 |     # Source: Geekerwan
 48 |     sched1: 24-entry Scheduler \#1
 49 | 
 50 |     # Source: Geekerwan
 51 |     pipe1: Pipe \#1 {
 52 |       ALU
 53 |       BR
 54 |       FLAGS
 55 |       ADR
 56 |     }
 57 |     dispatch1 -> sched1 -> rf.irf -> pipe1
 58 | 
 59 |     # Source: Geekerwan
 60 |     sched2: 26-entry Scheduler \#2
 61 | 
 62 |     # Source: Geekerwan
 63 |     pipe2: Pipe \#2 {
 64 |       ALU
 65 |       BR
 66 |       FLAGS
 67 |       ADR
 68 |     }
 69 |     dispatch1 -> sched2 -> rf.irf -> pipe2
 70 | 
 71 |     # Source: Geekerwan
 72 |     sched3: 16-entry Scheduler \#3
 73 | 
 74 |     # Source: Geekerwan
 75 |     pipe3: Pipe \#3 {
 76 |       ALU
 77 |       FLAGS
 78 |     }
 79 |     dispatch1 -> sched3 -> rf.irf -> pipe3
 80 | 
 81 |     # Source: Geekerwan
 82 |     dispatch2: 12-entry Dispatch Queue \#2
 83 | 
 84 |     # Source: Geekerwan
 85 |     sched4: 12-entry Scheduler \#4
 86 | 
 87 |     # Source: Geekerwan
 88 |     pipe4: Pipe \#4 {
 89 |       ALU
 90 |     }
 91 |     dispatch2 -> sched4 -> rf.irf -> pipe4
 92 | 
 93 |     # Source: Geekerwan
 94 |     sched5: 28-entry Scheduler \#5
 95 | 
 96 |     # Source: Geekerwan
 97 |     pipe5: Pipe \#5 {
 98 |       ALU
 99 |       MUL
100 |       DIV
101 |     }
102 |     dispatch2 -> sched5 -> rf.irf -> pipe5
103 | 
104 |     # Source: Geekerwan
105 |     sched6: 28-entry Scheduler \#6
106 | 
107 |     # Source: Geekerwan
108 |     pipe6: Pipe \#6 {
109 |       ALU
110 |       MUL
111 |       BFM
112 |       MADD
113 |     }
114 |     dispatch2 -> sched6 -> rf.irf -> pipe6
115 | 
116 |     # Source: Geekerwan
117 |     dispatch3: 10-entry Dispatch Queue \#3
118 | 
119 |     # Source: Geekerwan
120 |     sched7: 52-entry Scheduler \#7
121 | 
122 |     # Source: Geekerwan
123 |     pipe7: Pipe \#7 {
124 |       STORE
125 |     }
126 |     dispatch3 -> sched7 -> rf.irf -> pipe7
127 | 
128 |     # Source: Geekerwan
129 |     pipe8: Pipe \#8 {
130 |       LOAD
131 |       STORE
132 |     }
133 |     dispatch3 -> sched7 -> rf.irf -> pipe8
134 | 
135 |     # Source: Geekerwan
136 |     pipe9: Pipe \#9 {
137 |       LOAD
138 |     }
139 |     dispatch3 -> sched7 -> rf.irf -> pipe9
140 | 
141 |     # Source: Geekerwan
142 |     pipe10: Pipe \#10 {
143 |       LOAD
144 |     }
145 |     dispatch3 -> sched7 -> rf.irf -> pipe10
146 | 
147 |     lsu: LSU {
148 |       # Source: Geekerwan
149 |       128-entry Load Queue
150 |       60-entry Store Queue
151 |     }
152 | 
153 |     pipe7 -> lsu
154 |     pipe8 -> lsu
155 |     pipe9 -> lsu
156 |     pipe10 -> lsu
157 | 
158 |     rob -> dispatch1
159 |     rob -> dispatch2
160 |     rob -> dispatch3
161 | 
162 |     # Source: Geekerwan
163 |     dispatch4: 12-entry Dispatch Queue \#4
164 | 
165 |     # Source: Geekerwan
166 |     sched8: 36-entry Scheduler \#8
167 | 
168 |     # Source: Geekerwan
169 |     pipe11: Pipe \#11 {
170 |       FP
171 |       SIMD
172 |     }
173 |     dispatch4 -> sched8 -> rf.vrf -> pipe11
174 | 
175 |     # Source: Geekerwan
176 |     sched9: 36-entry Scheduler \#9
177 | 
178 |     # Source: Geekerwan
179 |     pipe12: Pipe \#12 {
180 |       FP
181 |       SIMD
182 |     }
183 |     dispatch4 -> sched9 -> rf.vrf -> pipe12
184 | 
185 |     # Source: Geekerwan
186 |     sched10: 36-entry Scheduler \#10
187 | 
188 |     # Source: Geekerwan
189 |     pipe13: Pipe \#13 {
190 |       FP
191 |       SIMD
192 |       TO INT
193 |     }
194 |     dispatch4 -> sched10 -> rf.vrf -> pipe13
195 | 
196 |     # Source: Geekerwan
197 |     sched11: 36-entry Scheduler \#11
198 | 
199 |     # Source: Geekerwan
200 |     pipe14: Pipe \#14 {
201 |       FP
202 |       SIMD
203 |       FSQRT
204 |       FCSEL
205 |       TO INT
206 |     }
207 |     dispatch4 -> sched11 -> rf.vrf -> pipe14
208 |     rob -> dispatch4
209 |   }
210 |   frontend.rename -> backend.rob
211 | 
212 |   mem: Memory {
213 |     l1: L1 DC {
214 |       # Source: Geekerwan
215 |       l1dc: 128KB L1DC
216 |     }
217 |   }
218 |   backend.lsu -> mem.l1
219 | 
220 |   info: |md
221 |     Drawn by Jiajie Chen @jiegec
222 | 
223 |     Based on data from Geekerwan
224 |   |
225 | }


--------------------------------------------------------------------------------
/docs/avalanche.md:
--------------------------------------------------------------------------------
 1 | # Apple M2 P-core aka Avalanche
 2 | 
 3 | ![](./avalanche.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Apple M2 Blizzard 微架构评测(上)：阳春白雪](https://zhuanlan.zhihu.com/p/675322260)
 8 | - [Apple M2 Blizzard 微架构评测(中)：阳春白雪](https://zhuanlan.zhihu.com/p/678983061)
 9 | - [不为人知的角落，Apple M2 的小小努力（其一）](https://zhuanlan.zhihu.com/p/662561990)
10 | - [苹果 M4 性能分析：尽力了，但芯片工艺快到头了！](https://www.bilibili.com/video/BV1NJ4m1w7zk/)
11 | 


--------------------------------------------------------------------------------
/docs/cbp.md:
--------------------------------------------------------------------------------
  1 | # Reverse Engineered Conditional Branch Predictors
  2 | 
  3 | Reverse engineered conditional branch predictors, using the methodology from the following papers:
  4 | 
  5 | - Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution
  6 | - Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis
  7 | 
  8 | Code can be found at [jiegec/cpu-micro-benchmarks](https://github.com/jiegec/cpu-micro-benchmarks).
  9 | 
 10 | Glossary:
 11 | 
 12 | - B: the branch address; on ARM64, it is the address of the first byte of the instruction; on AMD64, it is the address of the last byte of the instruction
 13 | - T: the target address
 14 | - PHR: Path History Register
 15 | - PHT: Pattern History Table
 16 | - footprint: how many bits are xor-ed into PHR for each taken branch
 17 | 
 18 | Overview:
 19 | 
 20 | | uArch               | PHR in bits          | T bits  | B bits  | footprint bits |
 21 | |---------------------|----------------------|---------|---------|----------------|
 22 | | Qualcomm Oryon      | `100*1 + 32*1 = 132` | T[31:2] | B[5:2]  | 30 + 4         |
 23 | | Apple Firestorm     | `100*1 + 28*1 = 128` | T[31:2] | B[5:2]  | 30 + 4         |
 24 | | Apple Icestorm      | `60*1 + 16*1 = 76`   | T[47:2] | B[5:2]  | 46 + 4         |
 25 | | ARM Neoverse V1     | `64*3 = 192`         | T[7:2]  | B[14:2] | 3              |
 26 | | ARM Neoverse N1     | `48*3 = 144`         | T[7:2]  | B[8:2]  | 3              |
 27 | | Intel Sunny Cove    | `194*2 = 388`        | T[5:0]  | B[15:0] | 16             |
 28 | | Intel Golden Cove   | `194*2 = 388`        | T[5:0]  | B[15:0] | 16             |
 29 | | Intel Raptor Cove   | `194*2 = 388`        | T[5:0]  | B[15:0] | 16             |
 30 | | Intel Redwood Cove  | `194*2 = 388`        | T[5:0]  | B[15:0] | 16             |
 31 | | Intel Cascade Lake  | `93*2 = 186`         | T[5:0]  | B[18:3] | 16             |
 32 | | Intel Skylake       | `93*2 = 186`         | T[5:0]  | B[18:3] | 16             |
 33 | | Intel Haswell       | `93*2 = 186`         | T[5:0]  | B[19:4] | 16             |
 34 | | Intel Ivy Bridge    | `93*2 = 186`         | T[5:0]  | B[19:4] | 16             |
 35 | 
 36 | ## Qualcomm Oryon
 37 | 
 38 | - PHRT: 100 bits
 39 | - PHRB: 32 bits
 40 | - PHRT is updated upon taken branch: `PHRTnew = (PHRTold << 2) xor T[31:2]`
 41 | - PHRB is updated upon taken branch: `PHRBnew = (PHRBold << 2) xor B[5:2]`
 42 | - PHT: 6 tables, see [Result of Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon](https://github.com/jiegec/cpu-micro-benchmarks/blob/master/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/README.md)
 43 | - Source: Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis
 44 | 
 45 | ## ARM Neoverse V1
 46 | 
 47 | - PHR: `64*3=192` bits
 48 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 3) xor footprint`
 49 | - footprint has 3 bits:
 50 |     - footprint[0] = T[2] xor T[5] xor B[3] xor B[6] xor B[9] xor B[12]
 51 |     - footprint[1] = T[3] xor T[6] xor B[4] xor B[7] xor B[10] xor B[13]
 52 |     - footprint[2] = T[4] xor T[7] xor B[5] xor B[8] xor B[11] xor B[14]
 53 | - Source: [Jiajie Chen](https://github.com/jiegec)
 54 | 
 55 | ## ARM Neoverse N1
 56 | 
 57 | - PHR: `48*3=144` bits
 58 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 3) xor footprint`
 59 | - footprint has 3 bits:
 60 |     - footprint[0] = T[2] xor T[5] xor B[3] xor B[6]
 61 |     - footprint[1] = T[3] xor T[6] xor B[4] xor B[7]
 62 |     - footprint[2] = T[4] xor T[7] xor B[5] xor B[8]
 63 | - Source: [Jiajie Chen](https://github.com/jiegec)
 64 | 
 65 | ## Apple Firestorm
 66 | 
 67 | - PHRT: 100 bits
 68 | - PHRB: 28 bits
 69 | - PHRT is updated upon taken branch: `PHRTnew = (PHRTold << 2) xor T[31:2]`
 70 | - PHRB is updated upon taken branch: `PHRBnew = (PHRBold << 2) xor B[5:2]`
 71 | - PHT: 6 tables, see [Result of Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon](https://github.com/jiegec/cpu-micro-benchmarks/blob/master/reports/dissecting_cbp_of_apple_firestorm_and_qualcomm_oryon/README.md)
 72 | - Source: Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis
 73 | 
 74 | ## Apple Icestorm
 75 | 
 76 | - PHRT: 60 bits
 77 | - PHRB: 16 bits
 78 | - PHRT is updated upon taken branch: `PHRTnew = (PHRTold << 2) xor T[47:2]`
 79 | - PHRB is updated upon taken branch: `PHRBnew = (PHRBold << 2) xor B[5:2]`
 80 | - Source: [Jiajie Chen](https://github.com/jiegec)
 81 | 
 82 | ## Intel Haswell/Ivy Bridge
 83 | 
 84 | - PHR: `93*2=186` bits
 85 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 2) xor footprint`
 86 | - footprint has 16 bits:
 87 |     - footprint[0] = B[6] xor T[0]
 88 |     - footprint[1] = B[7] xor T[1]
 89 |     - footprint[2] = B[10] xor T[2]
 90 |     - footprint[3] = B[11] xor T[3]
 91 |     - footprint[4] = B[14] xor T[4]
 92 |     - footprint[5] = B[15] xor T[5]
 93 |     - footprint[6] = B[4]
 94 |     - footprint[7] = B[5]
 95 |     - footprint[8] = B[8]
 96 |     - footprint[9] = B[9]
 97 |     - footprint[10] = B[12]
 98 |     - footprint[11] = B[13]
 99 |     - footprint[12] = B[16]
100 |     - footprint[13] = B[17]
101 |     - footprint[14] = B[18]
102 |     - footprint[15] = B[19]
103 | - PHT:
104 |     - 3 tables
105 |     - each table is 4-way associative
106 |     - each table has 9 index bits, including PC[4]
107 |     - each table has `4*2^9=2048` entries
108 | - Source: Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution (the paper contains a typo: `B13 xor T5` should be `B15 xor T5`)
109 | - Reproduced by [Jiajie Chen](https://github.com/jiegec)
110 | 
111 | ## Intel Cascade Lake/Skylake
112 | 
113 | - PHR: `93*2=186` bits
114 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 2) xor footprint`
115 | - footprint has 16 bits:
116 |     - footprint[0] = B[3] xor T[0]
117 |     - footprint[1] = B[4] xor T[1]
118 |     - footprint[2] = B[7] xor T[2]
119 |     - footprint[3] = B[8] xor T[3]
120 |     - footprint[4] = B[11] xor T[4]
121 |     - footprint[5] = B[12] xor T[5]
122 |     - footprint[6] = B[5]
123 |     - footprint[7] = B[6]
124 |     - footprint[8] = B[9]
125 |     - footprint[9] = B[10]
126 |     - footprint[10] = B[13]
127 |     - footprint[11] = B[14]
128 |     - footprint[12] = B[15]
129 |     - footprint[13] = B[16]
130 |     - footprint[14] = B[17]
131 |     - footprint[15] = B[18]
132 | - PHT:
133 |     - 3 tables
134 |     - history length of the 3 tables: 22, 58, 186
135 |     - each table is 4-way associative
136 |     - each table has 9 index bits, including PC[5]
137 |     - each table has `4*2^9=2048` entries
138 | - Source: Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution
139 | - Reproduced by [Jiajie Chen](https://github.com/jiegec)
140 | 
141 | ## Intel Sunny Cove/Golden Cove/Raptor Cove/Redwood Cove
142 | 
143 | - PHR: `194*2=388` bits
144 | - PHR is updated upon taken branch: `PHRnew = (PHRold << 2) xor footprint`
145 | - footprint has 16 bits:
146 |     - footprint[0] = B[3] xor T[0]
147 |     - footprint[1] = B[4] xor T[1]
148 |     - footprint[2] = B[5]
149 |     - footprint[3] = B[6]
150 |     - footprint[4] = B[7]
151 |     - footprint[5] = B[8]
152 |     - footprint[6] = B[9]
153 |     - footprint[7] = B[10]
154 |     - footprint[8] = B[0] xor T[2]
155 |     - footprint[9] = B[1] xor T[3]
156 |     - footprint[10] = B[2] xor T[4]
157 |     - footprint[11] = B[11] xor T[5]
158 |     - footprint[12] = B[12]
159 |     - footprint[13] = B[13]
160 |     - footprint[14] = B[14]
161 |     - footprint[15] = B[15]
162 | - PHT:
163 |     - 4 tables
164 |     - history length of the 4 tables: 36, 68, 132, 388
165 |     - each table is 4-way associative
166 |     - each table has 9 index bits, including PC[5]
167 |     - each table has `4*2^9=2048` entries
168 | - Source: Half&Half: Demystifying Intel’s Directional Branch Predictors for Fast, Secure Partitioned Execution (the reproduction by [Jiajie Chen](https://github.com/jiegec) reaches a different conclusion from the paper regarding the PHT tables)
169 | - Reproduced by [Jiajie Chen](https://github.com/jiegec)
170 | 


--------------------------------------------------------------------------------
/docs/comparison.md:
--------------------------------------------------------------------------------
 1 | # Comparisons across microarchitectures
 2 | 
 3 | ## Frontend
 4 | 
 5 | ### Branch Prediction
 6 | 
 7 | {{ bp_comparison() }}
 8 | 
 9 | ### L1 ICache + ITLB
10 | 
11 | {{ l1ic_comparison() }}
12 | 
13 | ### Move Elimination / Zeroing Idiom / Ones Idiom
14 | 
15 | | Pattern\uArch              | Oryon   | Firestorm | Golden Cove | Cortex X1 | Zen 3   | Sunny Cove | Zen 1-2 |
16 | |----------------------------|---------|-----------|-------------|-----------|---------|------------|---------|
17 | | # ALU                      | 6       | 6         | 5           | 4         | 4       | 4          | 4       |
18 | | # Dispatch                 | 8       | 8         | 6           | 8         | 6       | 5          | 5       |
19 | | Dep int add                | 1.0     | 1.0       | 1.0         | 1.0       | 1.0     | 1.0        | 1.0     |
20 | | Indep int add              | 6.0     | 3.9       | 4.7         | 4.0       | 4.0     | 4.0        | 4.0     |
21 | | Dep int mov                | 1.2     | 1.2       | **5.5**     | 1.3       | **6.0** | **4.6**    | **5.0** |
22 | | Indep int mov              | **8.0** | **8.0**   | **5.4**     | 4.0       | **6.0** | **4.6**    | **5.0** |
23 | | Dep zero via xor           | 1.0     | 1.0       | **5.5**     | 1.0       | **6.0** | **4.6**    | *4.0*   |
24 | | Dep zero via sub           | 1.0     | 1.0       | **6.0**     | 1.0       | **6.0** | **4.6**    | *4.0*   |
25 | | Indep set zero via mov     | 6.0     | **8.0**   | **6.0**     | **6.0**   | 4.0     | 3.7        | 4.0     |
26 | | Indep set one via mov      | 6.0     | **7.8**   | **6.0**     | 4.0       | 4.0     | 4.0        | 4.0     |
27 | | Indep set two via mov      | 6.0     | **7.8**   | **6.0**     | 4.0       | 4.0     | 4.0        | 4.0     |
28 | | Indep set 1024 via mov     | 6.0     | **7.8**   | 5.0         | 4.0       | 4.0     | 4.0        | 4.0     |
29 | | Vec dep mov                | 0.6     | 0.6       | **6.0**     | 0.5       | **6.0** | 1.0        | 4.0     |
30 | | Vec indep mov              | **8.0** | **8.0**   | **6.0**     | 4.0       | **6.0** | 3.0        | 4.0     |
31 | | Vec dep set zero via xor   | 0.5     | 0.5       | **6.0**     | 0.5       | *4.0*   | **5.0**    | *4.0*   |
32 | | Vec dep set zero via sub   | 0.5     | 0.5       | 0.5         | 0.5       | 0.3     | 0.25       | 0.3     |
33 | | Vec indep set zero via mov | 4.0     | **8.0**   | N/A         | **6.0**   | N/A     | N/A        | N/A     |
34 | | Nop                        | **8.0** | **8.0**   | **5.7**     | **8.0**   | **6.0** | 4.0        | **5.0** |
35 | 
36 | - **Bold**: Not executed by ALU/FPU, eliminated at rename stage
37 | - *Italics*: Executed by ALU/FPU, but source register dependency was removed so that dependent ops can be executed in parallel
38 | - Although Cortex-X1 has a dispatch width of 8, it has many limitations on instruction types
39 | 
40 | ## Backend
41 | 
42 | ### ROB
43 | 
44 | {{ rob_comparison() }}
45 | 
46 | ### LSU
47 | 
48 | | uArch       | 64b Load | 64b Store | 128b Load | 128b Store | 256b Load | 256b Store |
49 | |-------------|----------|-----------|-----------|------------|-----------|------------|
50 | | Zen2        | 2/cycle  | 1/cycle   | 2/cycle   | 1/cycle    | 2/cycle   | 1/cycle    |
51 | | Zen4        | 3/cycle  | 2/cycle   | 2/cycle   | 1/cycle    | 2/cycle   | 1/cycle    |
52 | | Golden Cove | 3/cycle  | 2/cycle   | 3/cycle   | 2/cycle    | 3/cycle   | 2/cycle    |
53 | | Firestorm   | 3/cycle  | 2/cycle   | 3/cycle   | 2/cycle    | 1.5/cycle | 1/cycle    |
54 | | Oryon       | 4/cycle  | 2/cycle   | 4/cycle   | 2/cycle    | 2/cycle   | 1/cycle    |
55 | 
56 | ### Execution Unit
57 | 
58 | {{ eu_comparison() }}
59 | 
60 | ## Comparison between microarchitectures
61 | 
62 | ### Firestorm vs Oryon
63 | 
64 | {{ firestorm_oryon_comparison() }}
65 | 
66 | ### Cortex-X series
67 | 
68 | {{ cortex_x_comparison() }}
69 | 
70 | 


--------------------------------------------------------------------------------
/docs/cortex_a75.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Cortex-A75 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       ras: RAS
  6 | 
  7 |       # Source: Chips and Cheese
  8 |       l0btb: 48-entry L0 BTB
  9 | 
 10 |       # Source: Chips and Cheese
 11 |       l1btb: 3072-entry L1 BTB
 12 |     }
 13 | 
 14 |     l1ic: L1 IC {
 15 |       # Source: Chips and Cheese
 16 |       l1ic: 64KB 4-way L1 IC
 17 | 
 18 |       # Source: Chips and Cheese
 19 |       l1itlb: 32-entry L1 ITLB
 20 |     }
 21 | 
 22 |     bp -> l1ic
 23 | 
 24 |     # Source: Chips and Cheese
 25 |     decode: 3-way Decode
 26 |     l1ic -> decode: 12 bytes/cycle
 27 |     decode -> bp
 28 | 
 29 |     # Source: Chips and Cheese
 30 |     rename: 3-way Rename
 31 |     decode -> rename
 32 |   }
 33 | 
 34 |   backend: Backend {
 35 |     # Source: Chips and Cheese
 36 |     rob: 73-entry ROB
 37 | 
 38 |     rf: Register File {
 39 |       # Source: Chips and Cheese
 40 |       irf: 101-entry Integer Register File
 41 | 
 42 |       # Source: Chips and Cheese
 43 |       frf: 89-entry FP Register File
 44 |     }
 45 | 
 46 |     # Source: Chips and Cheese
 47 |     sched1: Scheduler \#1
 48 | 
 49 |     # Source: Chips and Cheese
 50 |     pipe1: Pipe \#1 {
 51 |       ALU
 52 |     }
 53 |     rob -> sched1 -> rf -> pipe1
 54 | 
 55 |     # Source: Chips and Cheese
 56 |     sched2: Scheduler \#2
 57 | 
 58 |     # Source: Chips and Cheese
 59 |     pipe2: Pipe \#2 {
 60 |       ALU
 61 |       INT MUL
 62 |     }
 63 |     rob -> sched2 -> rf -> pipe2
 64 | 
 65 |     # Source: Chips and Cheese
 66 |     sched3: Scheduler \#3
 67 | 
 68 |     # Source: Chips and Cheese
 69 |     pipe3: Pipe \#3 {
 70 |       Branch
 71 |     }
 72 |     rob -> sched3 -> rf -> pipe3
 73 | 
 74 |     # Source: Chips and Cheese
 75 |     sched4: Scheduler \#4
 76 | 
 77 |     # Source: Chips and Cheese
 78 |     pipe4: Pipe \#4 {
 79 |       AGU
 80 |     }
 81 |     rob -> sched4 -> rf -> pipe4
 82 | 
 83 |     # Source: Chips and Cheese
 84 |     sched5: Scheduler \#5
 85 | 
 86 |     # Source: Chips and Cheese
 87 |     pipe5: Pipe \#5 {
 88 |       AGU
 89 |     }
 90 |     rob -> sched5 -> rf -> pipe5
 91 | 
 92 |     # Source: Chips and Cheese
 93 |     sched6: Scheduler \#6
 94 | 
 95 |     # Source: Chips and Cheese
 96 |     pipe6: Pipe \#6 {
 97 |       FMA
 98 |       128b ALU
 99 |       AES
100 |     }
101 |     rob -> sched6 -> rf -> pipe6
102 | 
103 |     # Source: Chips and Cheese
104 |     sched7: Scheduler \#7
105 | 
106 |     # Source: Chips and Cheese
107 |     pipe7: Pipe \#7 {
108 |       FMA
109 |       128b ALU
110 |     }
111 |     rob -> sched7 -> rf -> pipe7
112 | 
113 |     lsu: LSU {
114 |       # Source: Chips and Cheese
115 |       69-entry Load Queue
116 |       14-entry Store Queue
117 |     }
118 | 
119 |     pipe4 -> lsu
120 |     pipe5 -> lsu
121 |   }
122 |   frontend.rename -> backend.rob
123 | 
124 |   mem: Memory {
125 |     l1: L1 DC {
126 |       # Source: Chips and Cheese
127 |       l1dc: 64KB 4-way L1DC
128 |       l1dtlb: 48-entry L1 DTLB
129 |       l2tlb: 1024-entry 4-way L2 TLB
130 |     }
131 | 
132 |     # Source: Chips and Cheese
133 |     l2: L2 DC
134 |     l1 -> l2
135 |   }
136 |   backend.lsu -> mem.l1
137 | 
138 |   info: |md
139 |     Drawn by Jiajie Chen @jiegec
140 | 
141 |     Based on data from Chips and Cheese
142 |   |
143 | }


--------------------------------------------------------------------------------
/docs/cortex_a75.md:
--------------------------------------------------------------------------------
1 | # ARM Cortex A75
2 | 
3 | ![](./cortex_a75.svg)
4 | 
5 | References:
6 | 
7 | - [Inside SiFive’s P550 Microarchitecture](https://chipsandcheese.com/p/inside-sifives-p550-microarchitecture)
8 | 


--------------------------------------------------------------------------------
/docs/cortex_a77.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Cortex-A77 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: ARM
  5 |       # "4x larger L1-BTB (64-entry, 1-cycle latency)"
  6 |       l1btb: 64-entry L1 BTB, 0 bubble
  7 | 
  8 |       # Source: ARM
  9 |       # "33% larger main BTB (8K entry, better performance for real workloads)"
 10 |       l2btb: 8K-entry L2 BTB
 11 |     }
 12 | 
 13 |     l1ic: L1 IC {
 14 |       # Source: ARM
 15 |       l1ic: 64KB 4-way L1 IC
 16 | 
 17 |       # Source: ARM
 18 |       l1itlb: 48-entry fully associative L1 ITLB
 19 |     }
 20 | 
 21 |     fq: Fetch Queue
 22 |     bp -> fq
 23 |     fq -> l1ic
 24 | 
 25 |     iq: Instruction Queue
 26 |     l1ic -> iq
 27 | 
 28 |     # Source: ARM
 29 |     # "6 Instrs/cycle"
 30 |     decode: 6-way Decode
 31 |     iq -> decode
 32 | 
 33 |     # Source: ARM
 34 |     mopc: 1.5K MOP Cache
 35 |     decode -> mopc
 36 |     bp -> mopc
 37 | 
 38 |     mop: MOP Queue
 39 | 
 40 |     # Source: ARM
 41 |     mopc -> mop
 42 |     decode -> mop
 43 | 
 44 |     # Source: ARM
 45 |     rename: 6-way Rename
 46 |     mop -> rename
 47 |   }
 48 | 
 49 |   backend: Backend {
 50 |     # Source: ARM
 51 |     # "160 entry instruction window"
 52 |     rob: 160-entry ROB
 53 | 
 54 |     rf: Register File {
 55 |       irf: Integer Register File
 56 | 
 57 |       flagsrf: Flags Register File
 58 | 
 59 |       vrf: FP/Vector Register File
 60 |     }
 61 | 
 62 |     # Source: ARM
 63 |     sched1: ALU Scheduler
 64 | 
 65 |     # Source: ARM
 66 |     pipe1: Pipe \#1 {
 67 |       Branch
 68 |     }
 69 |     rob -> sched1 -> rf -> pipe1
 70 | 
 71 |     # Source: ARM
 72 |     pipe2: Pipe \#2 {
 73 |       Branch
 74 |     }
 75 |     rob -> sched1 -> rf -> pipe2
 76 | 
 77 |     # Source: ARM
 78 |     pipe3: Pipe \#3 {
 79 |       ALU
 80 |     }
 81 |     rob -> sched1 -> rf -> pipe3
 82 | 
 83 |     # Source: ARM
 84 |     pipe4: Pipe \#4 {
 85 |       ALU
 86 |     }
 87 |     rob -> sched1 -> rf -> pipe4
 88 | 
 89 |     # Source: ARM
 90 |     pipe5: Pipe \#5 {
 91 |       ALU
 92 |     }
 93 |     rob -> sched1 -> rf -> pipe5
 94 | 
 95 |     # Source: ARM
 96 |     pipe6: Pipe \#6 {
 97 |       ALU
 98 |       MAC
 99 |       DIV
100 |     }
101 |     rob -> sched1 -> rf -> pipe6
102 | 
103 |     # Source: ARM
104 |     sched2: FP/Vector Scheduler
105 | 
106 |     # Source: ARM
107 |     pipe7: Pipe \#7 {
108 |       FMUL
109 |       FADD
110 |       FDIV
111 |       Vec ALU
112 |       IMAC
113 |     }
114 |     rob -> sched2 -> rf -> pipe7
115 | 
116 |     # Source: ARM
117 |     pipe8: Pipe \#8 {
118 |       FMUL
119 |       FADD
120 |       Vec ALU
121 |     }
122 |     rob -> sched2 -> rf -> pipe8
123 | 
124 |     # Source: ARM
125 |     sched3: Memory Scheduler
126 | 
127 |     # Source: ARM
128 |     pipe9: Pipe \#9 {
129 |       Store Data
130 |     }
131 |     rob -> sched3 -> rf -> pipe9
132 | 
133 |     # Source: ARM
134 |     pipe10: Pipe \#10 {
135 |       Store Data
136 |     }
137 |     rob -> sched3 -> rf -> pipe10
138 | 
139 |     # Source: ARM
140 |     pipe11: Pipe \#11 {
141 |       Load AGU
142 |       Store AGU
143 |     }
144 |     rob -> sched3 -> rf -> pipe11
145 | 
146 |     # Source: ARM
147 |     pipe12: Pipe \#12 {
148 |       Load AGU
149 |       Store AGU
150 |     }
151 |     rob -> sched3 -> rf -> pipe12
152 | 
153 |     lsu: LSU {
154 |       Load Queue
155 |       Store Queue
156 |       # Source: ARM Software Optimization Guide
157 |       4 load to use integer latency
158 |       5 load to use fp latency
159 |       2 load/cycle
160 |       2 store/cycle
161 |     }
162 | 
163 |     pipe9 -> lsu
164 |     pipe10 -> lsu
165 |     pipe11 -> lsu
166 |     pipe12 -> lsu
167 |   }
168 |   frontend.rename -> backend.rob
169 | 
170 |   mem: Memory {
171 |     l1: L1 DC {
172 |       # Source: ARM
173 |       l1dc: 64KB 4-way L1 DC
174 | 
175 |       # Source: ARM
176 |       l1dtlb: 48-entry fully associative L1 DTLB
177 |     }
178 | 
179 |     l2: L2 {
180 |       # Source: ARM
181 |       128KB/256KB/512KB 8-way L2 Cache
182 | 
183 |       # Source: ARM
184 |       l2tlb: 1280-entry 5-way L2 TLB
185 |     }
186 | 
187 |     l1 -> l2
188 | 
189 |     l3: L3 {
190 |       # Source: ARM
191 |       512KB-4MB shared L3 cache
192 |     }
193 |     l2 -> l3
194 |   }
195 |   frontend.l1ic -> mem.l2
196 |   backend.lsu -> mem.l1
197 | 
198 |   info: |md
199 |     Drawn by Jiajie Chen @jiegec
200 | 
201 |     Based on data from ARM and Anandtech
202 |   |
203 | }


--------------------------------------------------------------------------------
/docs/cortex_a77.md:
--------------------------------------------------------------------------------
 1 | # ARM Cortex A77
 2 | 
 3 | ![](./cortex_a77.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Arm's New Cortex-A77 CPU Micro-architecture: Evolving Performance](https://www.anandtech.com/show/14384/arm-announces-cortexa77-cpu-ip)
 8 | - [Arm® Cortex®‑A77 Core Technical Reference Manual](https://developer.arm.com/documentation/101111/0101)
 9 | - [Arm Cortex-A77 Core Software Optimization Guide](https://developer.arm.com/documentation/swog011050/latest/)
10 | 


--------------------------------------------------------------------------------
/docs/cortex_x1.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Cortex-X1 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: ARM, JamesAslan
  5 |       # "50% larger L0-BTB capacity, 96 entries (zero-cycle bubble taken-branch latency)"
  6 |       l1btb: 96-entry L1 BTB, zero bubble, two taken branches per cycle
  7 | 
  8 |       # Source: JamesAslan
  9 |       l2btb: 8192-entry L2 BTB
 10 | 
 11 |       # Source: JamesAslan
 12 |       ras: 16-entry RAS
 13 |     }
 14 | 
 15 |     l1ic: L1 IC {
 16 |       # Source: JamesAslan
 17 |       # Source: ARM
 18 |       l1itlb: 48-entry fully-associative L1 ITLB
 19 | 
 20 |       # Source: ARM
 21 |       l1ic: 4-way 64KB L1 IC
 22 |     }
 23 | 
 24 |     fq: Fetch Queue
 25 |     bp -> fq
 26 |     fq -> l1ic
 27 | 
 28 |     iq: Instruction Queue
 29 |     # "5 instruction fetch from the instruction cache"
 30 |     # "Instructions are first fetched and then decoded into internal
 31 |     # Macro-OPerations (MOPs). From there, the MOPs proceed through register
 32 |     # renaming and dispatch stages. A MOP can be split into two Micro-
 33 |     # OPerations (µOPs) further down the pipeline after the decode stage. Once
 34 |     # dispatched, µOPs wait for their operands and issue out-of-order to one of
 35 |     # fifteen issue pipelines. Each issue pipeline can accept one µOP per
 36 |     # cycle."
 37 |     l1ic -> iq: 5 inst/cycle
 38 | 
 39 |     # Source: ARM
 40 |     decode: 5-way Decode
 41 |     iq -> decode
 42 | 
 43 |     # Source: ARM
 44 |     # "3K entries, for increased coverage"
 45 |     mopc: 3072-entry 4-way skewed-associative MOP Cache
 46 |     decode -> mopc
 47 |     bp -> mopc
 48 | 
 49 |     mop: MOP Queue
 50 | 
 51 |     # Source: ARM
 52 |     # "8 Mop fetch from the Mop cache"
 53 |     mopc -> mop: 8 macro ops/cycle
 54 |     decode -> mop: 5 instructions/cycle
 55 | 
 56 |     # Source: ARM
 57 |     # "The dispatch stage can process up to 8 MOPs per cycle and dispatch up to
 58 |     # 16 µOPs per cycle, with the following limitations ..."
 59 |     rename: 8-way Rename {
 60 |       Zero Idiom
 61 |     }
 62 |     mop -> rename
 63 |   }
 64 | 
 65 |   backend: Backend {
 66 |     # Source: ARM
 67 |     # "224 entry instruction window"
 68 |     rob: 224-entry ROB
 69 | 
 70 |     rf: Register File {
 71 |       irf: Integer Register File
 72 | 
 73 |       flagsrf: Flags Register File
 74 | 
 75 |       vrf: FP/Vector Register File
 76 |     }
 77 | 
 78 |     # Source: ARM
 79 |     sched1: ALU Scheduler
 80 | 
 81 |     # Source: ARM
 82 |     pipe1: Pipe \#1 {
 83 |       Branch
 84 |     }
 85 |     rob -> sched1 -> rf -> pipe1
 86 | 
 87 |     # Source: ARM
 88 |     pipe2: Pipe \#2 {
 89 |       Branch
 90 |     }
 91 |     rob -> sched1 -> rf -> pipe2
 92 | 
 93 |     # Source: ARM
 94 |     pipe3: Pipe \#3 {
 95 |       ALU
 96 |     }
 97 |     rob -> sched1 -> rf -> pipe3
 98 | 
 99 |     # Source: ARM
100 |     pipe4: Pipe \#4 {
101 |       ALU
102 |     }
103 |     rob -> sched1 -> rf -> pipe4
104 | 
105 |     # Source: ARM
106 |     pipe5: Pipe \#5 {
107 |       ALU
108 |       MUL
109 |     }
110 |     rob -> sched1 -> rf -> pipe5
111 | 
112 |     # Source: ARM
113 |     pipe6: Pipe \#6 {
114 |       ALU
115 |       MAC
116 |       DIV
117 |     }
118 |     rob -> sched1 -> rf -> pipe6
119 | 
120 |     # Source: ARM
121 |     sched2: FP/Vector Scheduler
122 | 
123 |     # Source: ARM
124 |     pipe7: Pipe \#7 {
125 |       FMUL
126 |       FADD
127 |       FDIV
128 |       Vec ALU
129 |       IMAC
130 |     }
131 |     rob -> sched2 -> rf -> pipe7
132 | 
133 |     # Source: ARM
134 |     pipe8: Pipe \#8 {
135 |       FMUL
136 |       FADD
137 |       Vec ALU
138 |     }
139 |     rob -> sched2 -> rf -> pipe8
140 | 
141 |     # Source: ARM
142 |     pipe9: Pipe \#9 {
143 |       FMUL
144 |       FADD
145 |       FDIV
146 |       Vec ALU
147 |       IMAC
148 |     }
149 |     rob -> sched2 -> rf -> pipe9
150 | 
151 |     # Source: ARM
152 |     pipe10: Pipe \#10 {
153 |       FMUL
154 |       FADD
155 |       Vec ALU
156 |     }
157 |     rob -> sched2 -> rf -> pipe10
158 | 
159 |     # Source: ARM
160 |     sched3: Memory Scheduler
161 | 
162 |     # Source: ARM
163 |     pipe11: Pipe \#11 {
164 |       Store Data
165 |     }
166 |     rob -> sched3 -> rf -> pipe11
167 | 
168 |     # Source: ARM
169 |     pipe12: Pipe \#12 {
170 |       Store Data
171 |     }
172 |     rob -> sched3 -> rf -> pipe12
173 | 
174 |     # Source: ARM
175 |     pipe13: Pipe \#13 {
176 |       Load AGU
177 |       Store AGU
178 |     }
179 |     rob -> sched3 -> rf -> pipe13
180 | 
181 |     # Source: ARM
182 |     pipe14: Pipe \#14 {
183 |       Load AGU
184 |       Store AGU
185 |     }
186 |     rob -> sched3 -> rf -> pipe14
187 | 
188 |     # Source: ARM
189 |     pipe15: Pipe \#15 {
190 |       Load AGU
191 |     }
192 |     rob -> sched3 -> rf -> pipe15
193 | 
194 |     lsu: LSU {
195 |       Load Queue
196 |       Store Queue
197 |       4 load to use integer latency
198 |       6 load to use fp latency
199 |       3 load/cycle
200 |       2 store/cycle
201 |     }
202 | 
203 |     pipe11 -> lsu
204 |     pipe12 -> lsu
205 |     pipe13 -> lsu
206 |     pipe14 -> lsu
207 |     pipe15 -> lsu
208 |   }
209 |   frontend.rename -> backend.rob
210 | 
211 |   mem: Memory {
212 |     l1: L1 DC {
213 |       # Source: ARM
214 |       l1dtlb: 40-entry fully associative L1 DTLB
215 | 
216 |       # Source: ARM
217 |       l1dc: 64KB L1 DC
218 | 
219 |       # Source: ARM
220 |       l2tlb: 2048-entry 8-way associative 4-bank L2 TLB
221 |     }
222 | 
223 |     l2: L2 {
224 |       # Source: ARM
225 |       512KB/1MB 8-way L2 Cache
226 |     }
227 | 
228 |     l1 -> l2
229 | 
230 |     l3: L3 {
231 |     }
232 |     l2 -> l3
233 |   }
234 |   frontend.l1ic -> mem.l2
235 |   backend.lsu -> mem.l1
236 | 
237 |   info: |md
238 |     Drawn by Jiajie Chen @jiegec
239 | 
240 |     Based on data from ARM, JamesAslan, Anandtech and Wikichip
241 |   |
242 | }


--------------------------------------------------------------------------------
/docs/cortex_x1.md:
--------------------------------------------------------------------------------
 1 | # ARM Cortex X1
 2 | 
 3 | ![](./cortex_x1.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Arm's New Cortex-A78 and Cortex-X1 Microarchitectures: An Efficiency and Performance Divergence](https://www.anandtech.com/show/15813/arm-cortex-a78-cortex-x1-cpu-ip-diverging/3)
 8 | - [Arm Cortex-X1: The First From The Cortex-X Custom Program](https://fuse.wikichip.org/news/3543/arm-cortex-x1-the-first-from-the-cortex-x-custom-program/)
 9 | - [ARM Cortex X1 微架构评测（上）：向山进发](https://zhuanlan.zhihu.com/p/619033328)
10 | - [ARM Cortex X1 微架构（下）：向山进发](https://zhuanlan.zhihu.com/p/620310569)
11 | - [Arm® Cortex®‑X1 Core Technical Reference Manual](https://developer.arm.com/documentation/101433/0102)
12 | 


--------------------------------------------------------------------------------
/docs/cortex_x2.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Cortex-X2 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     l1ic: L1 IC {
  7 |       # Source: ARM
  8 |       l1itlb: 48-entry fully associative L1 ITLB
  9 | 
 10 |       # Source: ARM
 11 |       l1ic: 64KB 4-way L1 IC
 12 |     }
 13 | 
 14 |     fq: Fetch Queue
 15 |     bp -> fq
 16 |     fq -> l1ic
 17 | 
 18 |     iq: Instruction Queue
 19 |     l1ic -> iq
 20 | 
 21 |     decode: 5-way Decode
 22 |     iq -> decode
 23 | 
 24 |     # Source: ARM
 25 |     mopc: 3072-entry 4-way MOP Cache
 26 |     decode -> mopc
 27 |     bp -> mopc
 28 | 
 29 |     mop: MOP Queue
 30 | 
 31 |     mopc -> mop
 32 |     decode -> mop
 33 | 
 34 |     # Source: ARM
 35 |     # "The dispatch stage can process up to 8 MOPs per cycle and dispatch up to
 36 |     # 16 μOPs per cycle, with the following limitations ..."
 37 |     rename: 8-way Rename {
 38 |       Zero Idiom
 39 |     }
 40 |     mop -> rename
 41 |   }
 42 | 
 43 |   backend: Backend {
 44 |     # Source: ARM
 45 |     rob: 288-MOP-entry ROB
 46 | 
 47 |     rf: Register File {
 48 |       irf: Integer Register File
 49 | 
 50 |       flagsrf: Flags Register File
 51 | 
 52 |       vrf: FP/Vector Register File
 53 |     }
 54 | 
 55 |     # Source: ARM
 56 |     sched1: ALU Scheduler
 57 | 
 58 |     # Source: ARM
 59 |     pipe1: Pipe \#1 {
 60 |       Branch
 61 |     }
 62 |     rob -> sched1 -> rf -> pipe1
 63 | 
 64 |     # Source: ARM
 65 |     pipe2: Pipe \#2 {
 66 |       Branch
 67 |     }
 68 |     rob -> sched1 -> rf -> pipe2
 69 | 
 70 |     # Source: ARM
 71 |     pipe3: Pipe \#3 {
 72 |       ALU
 73 |     }
 74 |     rob -> sched1 -> rf -> pipe3
 75 | 
 76 |     # Source: ARM
 77 |     pipe4: Pipe \#4 {
 78 |       ALU
 79 |     }
 80 |     rob -> sched1 -> rf -> pipe4
 81 | 
 82 |     # Source: ARM
 83 |     pipe5: Pipe \#5 {
 84 |       ALU
 85 |       MUL
 86 |     }
 87 |     rob -> sched1 -> rf -> pipe5
 88 | 
 89 |     # Source: ARM
 90 |     pipe6: Pipe \#6 {
 91 |       ALU
 92 |       MAC
 93 |       DIV
 94 |     }
 95 |     rob -> sched1 -> rf -> pipe6
 96 | 
 97 |     # Source: ARM
 98 |     sched2: FP/Vector Scheduler
 99 | 
100 |     # Source: ARM
101 |     pipe7: Pipe \#7 {
102 |       FMUL
103 |       FADD
104 |       FDIV
105 |       Vec ALU
106 |       IMAC
107 |     }
108 |     rob -> sched2 -> rf -> pipe7
109 | 
110 |     # Source: ARM
111 |     pipe8: Pipe \#8 {
112 |       FMUL
113 |       FADD
114 |       Vec ALU
115 |     }
116 |     rob -> sched2 -> rf -> pipe8
117 | 
118 |     # Source: ARM
119 |     pipe9: Pipe \#9 {
120 |       FMUL
121 |       FADD
122 |       FDIV
123 |       Vec ALU
124 |       IMAC
125 |     }
126 |     rob -> sched2 -> rf -> pipe9
127 | 
128 |     # Source: ARM
129 |     pipe10: Pipe \#10 {
130 |       FMUL
131 |       FADD
132 |       Vec ALU
133 |     }
134 |     rob -> sched2 -> rf -> pipe10
135 | 
136 |     # Source: ARM
137 |     sched3: Memory Scheduler
138 | 
139 |     # Source: ARM
140 |     pipe11: Pipe \#11 {
141 |       Store Data
142 |     }
143 |     rob -> sched3 -> rf -> pipe11
144 | 
145 |     # Source: ARM
146 |     pipe12: Pipe \#12 {
147 |       Store Data
148 |     }
149 |     rob -> sched3 -> rf -> pipe12
150 | 
151 |     # Source: ARM
152 |     pipe13: Pipe \#13 {
153 |       Load AGU
154 |       Store AGU
155 |     }
156 |     rob -> sched3 -> rf -> pipe13
157 | 
158 |     # Source: ARM
159 |     pipe14: Pipe \#14 {
160 |       Load AGU
161 |       Store AGU
162 |     }
163 |     rob -> sched3 -> rf -> pipe14
164 | 
165 |     # Source: ARM
166 |     pipe15: Pipe \#15 {
167 |       Load AGU
168 |     }
169 |     rob -> sched3 -> rf -> pipe15
170 | 
171 |     lsu: LSU {
172 |       Load Queue
173 |       Store Queue
174 |     }
175 | 
176 |     pipe11 -> lsu
177 |     pipe12 -> lsu
178 |     pipe13 -> lsu
179 |     pipe14 -> lsu
180 |     pipe15 -> lsu
181 |   }
182 |   frontend.rename -> backend.rob
183 | 
184 |   mem: Memory {
185 |     l1: L1 DC {
186 |       # Source: ARM
187 |       l1dtlb: 48-entry fully associative L1 DTLB
188 | 
189 |       # Source: ARM
190 |       l1dc: 64KB 4-way L1 DC
191 | 
192 |       # Source: ARM
193 |       l2tlb: 2048-entry 8-way L2 TLB
194 |     }
195 | 
196 |     l2: L2 {
197 |       # Source: ARM
198 |       l2dc: 512KB/1MB 8-way 4 bank L2 Cache
199 |     }
200 | 
201 |     l1 -> l2
202 | 
203 |     l3: L3 {
204 |     }
205 |     l2 -> l3
206 |   }
207 |   frontend.l1ic -> mem.l2
208 |   backend.lsu -> mem.l1
209 | 
210 |   info: |md
211 |     Drawn by Jiajie Chen @jiegec
212 | 
213 |     Based on data from Chips and Cheese, ARM and Anandtech
214 |   |
215 | }


--------------------------------------------------------------------------------
/docs/cortex_x2.md:
--------------------------------------------------------------------------------
 1 | # ARM Cortex X2
 2 | 
 3 | ![](./cortex_x2.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Cortex X2: Arm Aims High](https://chipsandcheese.com/2023/10/27/cortex-x2-arm-aims-high/)
 8 | - [Arm® Cortex®‑X2 Core Technical Reference Manual](https://developer.arm.com/documentation/101803/0200)
 9 | - [Arm Announces Mobile Armv9 CPU Microarchitectures: Cortex-X2, Cortex-A710 & Cortex-A510](https://www.anandtech.com/show/16693/arm-announces-mobile-armv9-cpu-microarchitectures-cortexx2-cortexa710-cortexa510/2)
10 | 


--------------------------------------------------------------------------------
/docs/cortex_x3.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Cortex-X3 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     l1ic: L1 IC {
  7 |       # Source: ARM
  8 |       l1itlb: 48-entry fully-associative L1 ITLB
  9 | 
 10 |       # Source: ARM
 11 |       l1ic: 64KB 4-way L1 IC
 12 |     }
 13 | 
 14 |     fq: Fetch Queue
 15 |     bp -> fq
 16 |     fq -> l1ic
 17 | 
 18 |     iq: Instruction Queue
 19 |     l1ic -> iq
 20 | 
 21 |     decode: 6-way Decode
 22 |     iq -> decode
 23 | 
 24 |     # Source: ARM
 25 |     mopc: 1536-entry 4-way skewed-associative MOP Cache
 26 |     decode -> mopc
 27 |     bp -> mopc
 28 | 
 29 |     mop: MOP Queue
 30 | 
 31 |     mopc -> mop: 8 MOP/cycle
 32 |     decode -> mop: 6 MOP/cycle
 33 | 
 34 |     # Source: ARM
 35 |     # "Cortex-X3 dispatch width: 6 instrs (I$), 8 insts (Mop$)"
 36 |     # "The dispatch stage can process up to 8 MOPs per cycle and dispatch up to
 37 |     # 16 µOPs per cycle, with the following limitations ..."
 38 |     rename: 8-way Rename {
 39 |       Zero Idiom
 40 |     }
 41 |     mop -> rename
 42 |   }
 43 | 
 44 |   backend: Backend {
 45 |     # Source: ARM
 46 |     rob: 320-MOP-entry ROB
 47 | 
 48 |     rf: Register File {
 49 |       irf: Integer Register File
 50 | 
 51 |       flagsrf: Flags Register File
 52 | 
 53 |       vrf: FP/Vector Register File
 54 |     }
 55 | 
 56 |     # Source: ARM
 57 |     sched1: ALU Scheduler
 58 | 
 59 |     # Source: ARM
 60 |     pipe1: Pipe \#1 {
 61 |       Branch
 62 |     }
 63 |     rob -> sched1 -> rf -> pipe1
 64 | 
 65 |     # Source: ARM
 66 |     pipe2: Pipe \#2 {
 67 |       Branch
 68 |     }
 69 |     rob -> sched1 -> rf -> pipe2
 70 | 
 71 |     # Source: ARM
 72 |     pipe3: Pipe \#3 {
 73 |       ALU
 74 |     }
 75 |     rob -> sched1 -> rf -> pipe3
 76 | 
 77 |     # Source: ARM
 78 |     pipe4: Pipe \#4 {
 79 |       ALU
 80 |     }
 81 |     rob -> sched1 -> rf -> pipe4
 82 | 
 83 |     # Source: ARM
 84 |     pipe5: Pipe \#5 {
 85 |       ALU
 86 |     }
 87 |     rob -> sched1 -> rf -> pipe5
 88 | 
 89 |     # Source: ARM
 90 |     pipe6: Pipe \#6 {
 91 |       ALU
 92 |     }
 93 |     rob -> sched1 -> rf -> pipe6
 94 | 
 95 |     # Source: ARM
 96 |     pipe7: Pipe \#7 {
 97 |       ALU
 98 |       MUL
 99 |     }
100 |     rob -> sched1 -> rf -> pipe7
101 | 
102 |     # Source: ARM
103 |     pipe8: Pipe \#8 {
104 |       ALU
105 |       MAC
106 |       DIV
107 |     }
108 |     rob -> sched1 -> rf -> pipe8
109 | 
110 |     # Source: ARM
111 |     sched2: FP/Vector Scheduler
112 | 
113 |     # Source: ARM
114 |     pipe9: Pipe \#9 {
115 |       FMUL
116 |       FADD
117 |       FDIV
118 |       Vec ALU
119 |       IMAC
120 |     }
121 |     rob -> sched2 -> rf -> pipe9
122 | 
123 |     # Source: ARM
124 |     pipe10: Pipe \#10 {
125 |       FMUL
126 |       FADD
127 |       Vec ALU
128 |     }
129 |     rob -> sched2 -> rf -> pipe10
130 | 
131 |     # Source: ARM
132 |     pipe11: Pipe \#11 {
133 |       FMUL
134 |       FADD
135 |       FDIV
136 |       Vec ALU
137 |       IMAC
138 |     }
139 |     rob -> sched2 -> rf -> pipe11
140 | 
141 |     # Source: ARM
142 |     pipe12: Pipe \#12 {
143 |       FMUL
144 |       FADD
145 |       Vec ALU
146 |     }
147 |     rob -> sched2 -> rf -> pipe12
148 | 
149 |     # Source: ARM
150 |     sched3: Memory Scheduler
151 | 
152 |     # Source: ARM
153 |     pipe13: Pipe \#13 {
154 |       Store Data
155 |     }
156 |     rob -> sched3 -> rf -> pipe13
157 | 
158 |     # Source: ARM
159 |     pipe14: Pipe \#14 {
160 |       Store Data
161 |     }
162 |     rob -> sched3 -> rf -> pipe14
163 | 
164 |     # Source: ARM
165 |     pipe15: Pipe \#15 {
166 |       Load AGU
167 |       Store AGU
168 |     }
169 |     rob -> sched3 -> rf -> pipe15
170 | 
171 |     # Source: ARM
172 |     pipe16: Pipe \#16 {
173 |       Load AGU
174 |       Store AGU
175 |     }
176 |     rob -> sched3 -> rf -> pipe16
177 | 
178 |     # Source: ARM
179 |     pipe17: Pipe \#17 {
180 |       Load AGU
181 |     }
182 |     rob -> sched3 -> rf -> pipe17
183 | 
184 |     lsu: LSU {
185 |       Load Queue
186 |       Store Queue
187 |     }
188 | 
189 |     pipe13 -> lsu
190 |     pipe14 -> lsu
191 |     pipe15 -> lsu
192 |     pipe16 -> lsu
193 |     pipe17 -> lsu
194 |   }
195 |   frontend.rename -> backend.rob
196 | 
197 |   mem: Memory {
198 |     l1: L1 DC {
199 |     }
200 | 
201 |     l2: L2 {
202 |     }
203 | 
204 |     l1 -> l2
205 | 
206 |     l3: L3 {
207 |     }
208 |     l2 -> l3
209 |   }
210 |   frontend.l1ic -> mem.l2
211 |   backend.lsu -> mem.l1
212 | 
213 |   info: |md
214 |     Drawn by Jiajie Chen @jiegec
215 | 
216 |     Based on data from ARM and Wikichip
217 |   |
218 | }


--------------------------------------------------------------------------------
/docs/cortex_x3.md:
--------------------------------------------------------------------------------
1 | # ARM Cortex X3
2 | 
3 | ![](./cortex_x3.svg)
4 | 
5 | References:
6 | 
7 | - [Arm® Cortex‑X3 Core Technical Reference Manual](https://developer.arm.com/documentation/101593/latest/)
8 | - [Arm Unveils Next-Gen Flagship Core: Cortex-X3](https://fuse.wikichip.org/news/6855/arm-unveils-next-gen-flagship-core-cortex-x3/)
9 | 


--------------------------------------------------------------------------------
/docs/cortex_x4.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Cortex-X4 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     l1ic: L1 IC {
  7 |       # Source: ARM
  8 |       l1itlb: 48-entry fully-associative L1 ITLB
  9 | 
 10 |       # Source: ARM
 11 |       l1ic: 64KB 4-way L1 IC
 12 |     }
 13 | 
 14 |     fq: Fetch Queue
 15 |     bp -> fq
 16 |     fq -> l1ic
 17 | 
 18 |     iq: Instruction Queue
 19 |     l1ic -> iq
 20 | 
 21 |     # Source: ARM
 22 |     decode: 10-way Decode
 23 |     iq -> decode
 24 | 
 25 |     # Source: ARM
 26 |     # "Cortex-X4 dispatch width: 10 instrs"
 27 |     # "The dispatch stage can process up to 10 MOPs per cycle and dispatch up to
 28 |     # 20 μOPs per cycle, with the following limitations ..."
 29 |     rename: 10-way Rename {
 30 |       Move Elimination
 31 |       Zero Idiom
 32 |     }
 33 |     decode -> rename
 34 |   }
 35 | 
 36 |   backend: Backend {
 37 |     # Source: ARM
 38 |     # "MCQ capacity: 320x2 -> 384x2"
 39 |     rob: 384-MOP-entry ROB
 40 | 
 41 |     rf: Register File {
 42 |       irf: Integer Register File
 43 | 
 44 |       flagsrf: Flags Register File
 45 | 
 46 |       vrf: FP/Vector Register File
 47 |     }
 48 | 
 49 |     # Source: ARM
 50 |     sched1: ALU Scheduler
 51 | 
 52 |     # Source: ARM
 53 |     pipe1: Pipe \#1 {
 54 |       Branch
 55 |     }
 56 |     rob -> sched1 -> rf -> pipe1
 57 | 
 58 |     # Source: ARM
 59 |     pipe2: Pipe \#2 {
 60 |       Branch
 61 |     }
 62 |     rob -> sched1 -> rf -> pipe2
 63 | 
 64 |     # Source: ARM
 65 |     pipe3: Pipe \#3 {
 66 |       Branch
 67 |     }
 68 |     rob -> sched1 -> rf -> pipe3
 69 | 
 70 |     # Source: ARM
 71 |     pipe4: Pipe \#4 {
 72 |       ALU
 73 |     }
 74 |     rob -> sched1 -> rf -> pipe4
 75 | 
 76 |     # Source: ARM
 77 |     pipe5: Pipe \#5 {
 78 |       ALU
 79 |     }
 80 |     rob -> sched1 -> rf -> pipe5
 81 | 
 82 |     # Source: ARM
 83 |     pipe6: Pipe \#6 {
 84 |       ALU
 85 |     }
 86 |     rob -> sched1 -> rf -> pipe6
 87 | 
 88 |     # Source: ARM
 89 |     pipe7: Pipe \#7 {
 90 |       ALU
 91 |     }
 92 |     rob -> sched1 -> rf -> pipe7
 93 | 
 94 |     # Source: ARM
 95 |     pipe8: Pipe \#8 {
 96 |       ALU
 97 |     }
 98 |     rob -> sched1 -> rf -> pipe8
 99 | 
100 |     # Source: ARM
101 |     pipe9: Pipe \#9 {
102 |       ALU
103 |     }
104 |     rob -> sched1 -> rf -> pipe9
105 | 
106 |     # Source: ARM
107 |     pipe10: Pipe \#10 {
108 |       ALU
109 |       MAC
110 |     }
111 |     rob -> sched1 -> rf -> pipe10
112 | 
113 |     # Source: ARM
114 |     pipe11: Pipe \#11 {
115 |       ALU
116 |       MAC
117 |       DIV
118 |     }
119 |     rob -> sched1 -> rf -> pipe11
120 | 
121 |     # Source: ARM
122 |     sched2: FP/Vector Scheduler
123 | 
124 |     # Source: ARM
125 |     pipe12: Pipe \#12 {
126 |       FMUL
127 |       FADD
128 |       FDIV
129 |       Vec ALU
130 |       IMAC
131 |     }
132 |     rob -> sched2 -> rf -> pipe12
133 | 
134 |     # Source: ARM
135 |     pipe13: Pipe \#13 {
136 |       FMUL
137 |       FADD
138 |       Vec ALU
139 |     }
140 |     rob -> sched2 -> rf -> pipe13
141 | 
142 |     # Source: ARM
143 |     pipe14: Pipe \#14 {
144 |       FMUL
145 |       FADD
146 |       FDIV
147 |       Vec ALU
148 |       IMAC
149 |     }
150 |     rob -> sched2 -> rf -> pipe14
151 | 
152 |     # Source: ARM
153 |     pipe15: Pipe \#15 {
154 |       FMUL
155 |       FADD
156 |       Vec ALU
157 |     }
158 |     rob -> sched2 -> rf -> pipe15
159 | 
160 |     # Source: ARM
161 |     sched3: Memory Scheduler
162 | 
163 |     # Source: ARM
164 |     pipe16: Pipe \#16 {
165 |       Store Data
166 |     }
167 |     rob -> sched3 -> rf -> pipe16
168 | 
169 |     # Source: ARM
170 |     pipe17: Pipe \#17 {
171 |       Store Data
172 |     }
173 |     rob -> sched3 -> rf -> pipe17
174 | 
175 |     # Source: ARM
176 |     pipe18: Pipe \#18 {
177 |       Load AGU
178 |       Store AGU
179 |     }
180 |     rob -> sched3 -> rf -> pipe18
181 | 
182 |     # Source: ARM
183 |     pipe19: Pipe \#19 {
184 |       Load AGU
185 |     }
186 |     rob -> sched3 -> rf -> pipe19
187 | 
188 |     # Source: ARM
189 |     pipe20: Pipe \#20 {
190 |       Load AGU
191 |     }
192 |     rob -> sched3 -> rf -> pipe20
193 | 
194 |     # Source: ARM
195 |     pipe21: Pipe \#21 {
196 |       Store AGU
197 |     }
198 |     rob -> sched3 -> rf -> pipe21
199 | 
200 |     lsu: LSU {
201 |       Load Queue
202 |       Store Queue
203 |     }
204 | 
205 |     pipe16 -> lsu
206 |     pipe17 -> lsu
207 |     pipe18 -> lsu
208 |     pipe19 -> lsu
209 |     pipe20 -> lsu
210 |     pipe21 -> lsu
211 |   }
212 |   frontend.rename -> backend.rob
213 | 
214 |   mem: Memory {
215 |     l1: L1 DC {
216 |       # Source: ARM
217 |       l1dtlb: 96-entry fully-associative L1 DTLB
218 | 
219 |       # Source: ARM
220 |       l1dc: 64KB 4-way L1 DC
221 | 
222 |       # Source: ARM
223 |       l2tlb: 2048-entry 8-way L2 TLB
224 |     }
225 | 
226 |     l2: L2 {
227 |       # Source: ARM
228 |       512KB/1MB/2MB 8-way 4-bank L2 Cache
229 |     }
230 | 
231 |     l1 -> l2
232 | 
233 |     l3: L3 {
234 |       32MB L3 Cache
235 |     }
236 |     l2 -> l3
237 |   }
238 |   frontend.l1ic -> mem.l2
239 |   backend.lsu -> mem.l1
240 | 
241 |   info: |md
242 |     Drawn by Jiajie Chen @jiegec
243 | 
244 |     Based on data from ARM and Anandtech
245 |   |
246 | }


--------------------------------------------------------------------------------
/docs/cortex_x4.md:
--------------------------------------------------------------------------------
1 | # ARM Cortex X4
2 | 
3 | ![](./cortex_x4.svg)
4 | 
5 | References:
6 | 
7 | - [Arm Unveils 2023 Mobile CPU Core Designs: Cortex-X4, A720, and A520 - the Armv9.2 Family](https://www.anandtech.com/show/18871/arm-unveils-armv92-mobile-architecture-cortex-x4-a720-and-a520-64bit-exclusive/2)
8 | - [Arm® Cortex-X4 Core Technical Reference Manual](https://developer.arm.com/documentation/102484/latest/)
9 | 


--------------------------------------------------------------------------------
/docs/cortex_x925.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Cortex-X925 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     l1ic: L1 IC {
  7 |       # Source: ARM
  8 |       l1itlb: 128-entry fully-associative L1 ITLB
  9 | 
 10 |       # Source: ARM
 11 |       l1ic: 64KB 4-way L1 IC
 12 |     }
 13 | 
 14 |     fq: Fetch Queue
 15 |     bp -> fq
 16 |     fq -> l1ic
 17 | 
 18 |     iq: Instruction Queue
 19 |     l1ic -> iq
 20 | 
 21 |     # Source: ARM
 22 |     decode: 10-way Decode
 23 |     iq -> decode
 24 | 
 25 |     # Source: ARM
 26 |     # "The dispatch stage can process up to 10 MOPs per cycle and dispatch up to
 27 |     # 20 µOPs per cycle, with the following limitations ..."
 28 |     rename: 10-way Rename {
 29 |       Sign Extension Elimination
 30 |       Move Elimination
 31 |       Zero Idiom
 32 |     }
 33 |     decode -> rename
 34 |   }
 35 | 
 36 |   backend: Backend {
 37 |     rob: ROB
 38 | 
 39 |     rf: Register File {
 40 |       irf: Integer Register File
 41 | 
 42 |       flagsrf: Flags Register File
 43 | 
 44 |       vrf: FP/Vector Register File
 45 |     }
 46 | 
 47 |     # Source: ARM
 48 |     sched1: ALU Scheduler
 49 | 
 50 |     # Source: ARM
 51 |     pipe1: Pipe \#1 {
 52 |       Branch
 53 |     }
 54 |     rob -> sched1 -> rf -> pipe1
 55 | 
 56 |     # Source: ARM
 57 |     pipe2: Pipe \#2 {
 58 |       Branch
 59 |     }
 60 |     rob -> sched1 -> rf -> pipe2
 61 | 
 62 |     # Source: ARM
 63 |     pipe3: Pipe \#3 {
 64 |       Branch
 65 |     }
 66 |     rob -> sched1 -> rf -> pipe3
 67 | 
 68 |     # Source: ARM
 69 |     pipe4: Pipe \#4 {
 70 |       ALU
 71 |     }
 72 |     rob -> sched1 -> rf -> pipe4
 73 | 
 74 |     # Source: ARM
 75 |     pipe5: Pipe \#5 {
 76 |       ALU
 77 |       MUL
 78 |     }
 79 |     rob -> sched1 -> rf -> pipe5
 80 | 
 81 |     # Source: ARM
 82 |     pipe6: Pipe \#6 {
 83 |       ALU
 84 |     }
 85 |     rob -> sched1 -> rf -> pipe6
 86 | 
 87 |     # Source: ARM
 88 |     pipe7: Pipe \#7 {
 89 |       ALU
 90 |       MUL
 91 |     }
 92 |     rob -> sched1 -> rf -> pipe7
 93 | 
 94 |     # Source: ARM
 95 |     pipe8: Pipe \#8 {
 96 |       ALU
 97 |     }
 98 |     rob -> sched1 -> rf -> pipe8
 99 | 
100 |     # Source: ARM
101 |     pipe9: Pipe \#9 {
102 |       ALU
103 |       MUL
104 |     }
105 |     rob -> sched1 -> rf -> pipe9
106 | 
107 |     # Source: ARM
108 |     pipe10: Pipe \#10 {
109 |       ALU
110 |       DIV
111 |       CRC
112 |     }
113 |     rob -> sched1 -> rf -> pipe10
114 | 
115 |     # Source: ARM
116 |     pipe11: Pipe \#11 {
117 |       ALU
118 |       MUL
119 |       CRC
120 |     }
121 |     rob -> sched1 -> rf -> pipe11
122 | 
123 |     # Source: ARM
124 |     sched2: FP/Vector Scheduler
125 | 
126 |     # Source: ARM
127 |     pipe12: Pipe \#12 {
128 |       Vec ALU
129 |       Vec INT MUL
130 |       FADD
131 |       FMUL
132 |       Crypto
133 |     }
134 |     rob -> sched2 -> rf -> pipe12
135 | 
136 |     # Source: ARM
137 |     pipe13: Pipe \#13 {
138 |       Vec ALU
139 |       FMUL
140 |       FADD
141 |       FDIV
142 |       FSQRT
143 |       Crypto
144 |     }
145 |     rob -> sched2 -> rf -> pipe13
146 | 
147 |     # Source: ARM
148 |     pipe14: Pipe \#14 {
149 |       Vec ALU
150 |       FMUL
151 |       FADD
152 |     }
153 |     rob -> sched2 -> rf -> pipe14
154 | 
155 |     # Source: ARM
156 |     pipe15: Pipe \#15 {
157 |       Vec ALU
158 |       Vec INT MUL
159 |       FMUL
160 |       FADD
161 |       Crypto
162 |     }
163 |     rob -> sched2 -> rf -> pipe15
164 | 
165 |     # Source: ARM
166 |     pipe16: Pipe \#16 {
167 |       Vec ALU
168 |       FMUL
169 |       FADD
170 |       Crypto
171 |     }
172 |     rob -> sched2 -> rf -> pipe16
173 | 
174 |     # Source: ARM
175 |     pipe17: Pipe \#17 {
176 |       Vec ALU
177 |       FMUL
178 |       FADD
179 |     }
180 |     rob -> sched2 -> rf -> pipe17
181 | 
182 |     # Source: ARM
183 |     sched3: Memory Scheduler
184 | 
185 |     # Source: ARM
186 |     pipe18: Pipe \#18 {
187 |       Store Data
188 |     }
189 |     rob -> sched3 -> rf -> pipe18
190 | 
191 |     # Source: ARM
192 |     pipe19: Pipe \#19 {
193 |       Store Data
194 |     }
195 |     rob -> sched3 -> rf -> pipe19
196 | 
197 |     # Source: ARM
198 |     pipe20: Pipe \#20 {
199 |       Load AGU
200 |       Store AGU
201 |     }
202 |     rob -> sched3 -> rf -> pipe20
203 | 
204 |     # Source: ARM
205 |     pipe21: Pipe \#21 {
206 |       Load AGU
207 |       Store AGU
208 |     }
209 |     rob -> sched3 -> rf -> pipe21
210 | 
211 |     # Source: ARM
212 |     pipe22: Pipe \#22 {
213 |       Load AGU
214 |     }
215 |     rob -> sched3 -> rf -> pipe22
216 | 
217 |     # Source: ARM
218 |     pipe23: Pipe \#23 {
219 |       Load AGU
220 |     }
221 |     rob -> sched3 -> rf -> pipe23
222 | 
223 |     lsu: LSU {
224 |       Load Queue
225 |       Store Queue
226 |     }
227 | 
228 |     pipe18 -> lsu
229 |     pipe19 -> lsu
230 |     pipe20 -> lsu
231 |     pipe21 -> lsu
232 |     pipe22 -> lsu
233 |     pipe23 -> lsu
234 |   }
235 |   frontend.rename -> backend.rob
236 | 
237 |   mem: Memory {
238 |     l1: L1 DC {
239 |       # Source: ARM
240 |       l1dtlb: 96-entry fully-associative L1 DTLB
241 | 
242 |       # Source: ARM
243 |       l1dc: 64KB 4-way L1 DC
244 | 
245 |       # Source: ARM
246 |       l2tlb: 2048-entry 8-way L2 TLB
247 |     }
248 | 
249 |     l2: L2 {
250 |       # Source: ARM
251 |       2MB 8-way or 3MB 12-way, 4-bank L2 Cache
252 |     }
253 | 
254 |     l1 -> l2
255 | 
256 |     l3: L3 {
257 |     }
258 |     l2 -> l3
259 |   }
260 |   frontend.l1ic -> mem.l2
261 |   backend.lsu -> mem.l1
262 | 
263 |   info: |md
264 |     Drawn by Jiajie Chen @jiegec
265 | 
266 |     Based on data from ARM and Anandtech
267 |   |
268 | }


--------------------------------------------------------------------------------
/docs/cortex_x925.md:
--------------------------------------------------------------------------------
 1 | # ARM Cortex X925
 2 | 
 3 | ![](./cortex_x925.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Arm Unveils 2024 CPU Core Designs, Cortex X925, A725 and A520: Arm v9.2 Redefined For 3nm](https://www.anandtech.com/show/21399/arm-unveils-2024-cpu-core-designs-cortex-x925-a725-and-a520-arm-v9-2-redefined-for-3nm-/2)
 8 | - [Arm® Cortex-X925 Core Technical Reference Manual](https://developer.arm.com/documentation/102807/0001)
 9 | - [Arm® Cortex-X925 Core Software Optimization Guide](https://developer.arm.com/documentation/109842/latest/)
10 | - [Cortex X925 微架构评测：路在何方](https://zhuanlan.zhihu.com/p/945571328)
11 | 


--------------------------------------------------------------------------------
/docs/crestmont.d2:
--------------------------------------------------------------------------------
  1 | cpu: Intel Meteor Lake E-core Crestmont CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       l1btb: 1024-entry L1 BTB
  6 | 
  7 |       # Source: Chips and Cheese, Intel
  8 |       # Intel: "Larger Branch Target Buffer (6K entry from 5K) with Enhanced
  9 |       # Path Based Branch Prediction."
 10 |       l2btb: 6144-entry L2 BTB
 11 | 
 12 |       # Source: Chips and Cheese
 13 |       ind: >=512-entry Indirect Target Array
 14 | 
 15 |       # Source: Intel
 16 |       # "Increased Branch Prediction Bandwidth (128B/cycle max from 32B/cycle on
 17 |       # Gracemont)."
 18 |       bw: 128B/cycle prediction bandwidth
 19 |     }
 20 | 
 21 |     l1ic: L1 IC {
 22 |       # Source: Chips and Cheese, Intel
 23 |       # Intel:
 24 |       # Level Entries Associativity     Architectural Page Size Cached Translation Size
 25 |       # ITLB  64      Fully associative All                     4KB, 256KB
 26 |       itlb: 64-entry fully associative ITLB
 27 | 
 28 |       # Source: Chips and Cheese
 29 |       l1ic: 64KB 8-way L1 IC
 30 |     }
 31 | 
 32 |     # Source: Chips and Cheese
 33 |     fq: 2x Fetch Queue
 34 |     bp -> fq
 35 |     fq -> l1ic
 36 | 
 37 |     iq1: Instruction Queue \#1
 38 |     l1ic -> iq1
 39 | 
 40 |     # Source: Chips and Cheese
 41 |     decode1: 3-way Decode \#1
 42 |     iq1 -> decode1
 43 | 
 44 |     # Source: Chips and Cheese
 45 |     uop1: UOP Queue \#1
 46 |     decode1 -> uop1
 47 | 
 48 |     iq2: Instruction Queue \#2
 49 |     l1ic -> iq2
 50 | 
 51 |     # Source: Chips and Cheese
 52 |     decode2: 3-way Decode \#2
 53 |     iq2 -> decode2
 54 | 
 55 |     # Source: Chips and Cheese
 56 |     uop2: UOP Queue \#2
 57 |     decode2 -> uop2
 58 | 
 59 |     # Source: Chips and Cheese, Intel
 60 |     # Intel: "Wider allocation width (6-wide from 5-wide)"
 61 |     rename: 6-way Rename {
 62 |       Move Elimination
 63 |       Zero Idiom
 64 |     }
 65 |     uop1 -> rename
 66 |     uop2 -> rename
 67 |   }
 68 | 
 69 |   backend: Backend {
 70 |     # Source: Chips and Cheese, Intel
 71 |     # Intel: "For the 256-entry retirement buffer, retirement can be up to eight
 72 |     # instructions per cycle."
 73 |     rob: 256-entry ROB, 8-wide retirement
 74 | 
 75 |     # Source: Chips and Cheese
 76 |     bob: 116-taken-entry 126-not-taken-entry Branch Order Buffer
 77 | 
 78 |     rf: Register File {
 79 |       # Source: Chips and Cheese
 80 |       irf: 214-entry Integer Register File
 81 | 
 82 |       # Source: Chips and Cheese
 83 |       vrf: 207-entry 128b Vector Register File
 84 |     }
 85 | 
 86 |     # Source: Chips and Cheese
 87 |     sched1: 16-entry ALU Scheduler \#1
 88 | 
 89 |     # Source: Chips and Cheese
 90 |     pipe1: Port 0 {
 91 |       grid-columns: 1
 92 |       ALU
 93 |     }
 94 |     rob -> sched1 -> rf.irf -> pipe1
 95 | 
 96 |     # Source: Chips and Cheese
 97 |     sched2: 16-entry ALU Scheduler \#2
 98 | 
 99 |     # Source: Chips and Cheese
100 |     pipe2: Port 1 {
101 |       grid-columns: 1
102 |       ALU
103 |       INT MUL
104 |       INT DIV
105 |       LEA
106 |     }
107 |     rob -> sched2 -> rf.irf -> pipe2
108 | 
109 |     # Source: Chips and Cheese
110 |     sched3: 16-entry ALU Scheduler \#3
111 | 
112 |     # Source: Chips and Cheese
113 |     pipe3: Port 2 {
114 |       grid-columns: 1
115 |       ALU
116 |       INT MUL
117 |       INT DIV
118 |       PDEP
119 |     }
120 |     rob -> sched3 -> rf.irf -> pipe3
121 | 
122 |     # Source: Chips and Cheese
123 |     sched4: 16-entry ALU Scheduler \#4
124 | 
125 |     # Source: Chips and Cheese
126 |     pipe4: Port 3 {
127 |       grid-columns: 1
128 |       ALU
129 |     }
130 |     rob -> sched4 -> rf.irf -> pipe4
131 | 
132 |     # Source: Chips and Cheese
133 |     sched5: 42-entry Branch/Store Data Scheduler \#5
134 | 
135 |     # Source: Chips and Cheese
136 |     pipe5: Port 30 {
137 |       JMP/Branch
138 |     }
139 |     rob -> sched5 -> rf.irf -> pipe5
140 | 
141 |     # Source: Chips and Cheese
142 |     pipe6: Port 31 {
143 |       JMP/Branch
144 |     }
145 |     rob -> sched5 -> rf.irf -> pipe6
146 | 
147 |     # Source: Chips and Cheese
148 |     pipe7: Port 8 {
149 |       Store Data
150 |     }
151 |     rob -> sched5 -> rf.irf -> pipe7
152 | 
153 |     # Source: Chips and Cheese
154 |     pipe8: Port 9 {
155 |       Store Data
156 |     }
157 |     rob -> sched5 -> rf.irf -> pipe8
158 | 
159 |     # Source: Chips and Cheese
160 |     nsq1: 22-entry AGU Non-Scheduling Queue \#1
161 | 
162 |     # Source: Chips and Cheese
163 |     sched6: 22-entry AGU Scheduler \#6
164 | 
165 |     # Source: Chips and Cheese
166 |     pipe9: Port 10 {
167 |       Load AGU
168 |     }
169 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe9
170 | 
171 |     # Source: Chips and Cheese
172 |     pipe10: Port 11 {
173 |       Load AGU
174 |     }
175 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe10
176 | 
177 |     # Source: Chips and Cheese
178 |     pipe11: Port 12 {
179 |       Store AGU
180 |     }
181 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe11
182 | 
183 |     # Source: Chips and Cheese
184 |     pipe12: Port 13 {
185 |       Store AGU
186 |     }
187 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe12
188 | 
189 |     lsu: LSU {
190 |       # Source: Chips and Cheese
191 |       80-entry Load Queue
192 |       48-entry Store Queue
193 | 
194 |       # Source: Intel
195 |       # "The Crestmont microarchitecture’s memory subsystem is designed to handle
196 |       # two 16 byte loads and two 16 byte stores per cycle, providing
197 |       # simultaneous 32 bytes of read bandwidth and 32 bytes of write bandwidth
198 |       # per cycle. The load-to-use latency for loads is typically four cycles.
199 |       # Suppose you are doing a pointer-chasing operation where the computed
200 |       # address results from a single prior load and a positive displacement of
201 |       # no more than +1023. In that case, the load-to-use latency observed can
202 |       # be reduced to three cycles."
203 |       2x128b load and 2x128b store per cycle
204 |       4 cycle load to use latency
205 |       3 cycle load to use latency in pointer chasing
206 |     }
207 | 
208 |     pipe7 -> lsu
209 |     pipe8 -> lsu
210 |     pipe9 -> lsu
211 |     pipe10 -> lsu
212 |     pipe11 -> lsu
213 |     pipe12 -> lsu
214 | 
215 |     # Source: Chips and Cheese
216 |     nsq2: 57-entry FP Non-Scheduling Queue \#2
217 | 
218 |     # Source: Chips and Cheese
219 |     sched7: 22-entry FStore Scheduler \#7
220 | 
221 |     # Source: Chips and Cheese
222 |     pipe13: Port 28 {
223 |       FP Store Data
224 |     }
225 |     rob -> nsq2 -> sched7 -> rf.vrf -> pipe13
226 | 
227 |     # Source: Chips and Cheese
228 |     pipe14: Port 29 {
229 |       FP Store Data
230 |     }
231 |     rob -> nsq2 -> sched7 -> rf.vrf -> pipe14
232 | 
233 |     # Source: Chips and Cheese
234 |     sched8: 38-entry FP/Vector Scheduler \#8
235 | 
236 |     # Source: Chips and Cheese
237 |     pipe15: Port 20 {
238 |       grid-columns: 1
239 |       INT Vec ALU
240 |       INT Vec MUL
241 |       FMA
242 |       FADD
243 |       FMUL
244 |       AES
245 |       SHA
246 |     }
247 |     rob -> nsq2 -> sched8 -> rf.vrf -> pipe15
248 | 
249 |     # Source: Chips and Cheese
250 |     pipe16: Port 21 {
251 |       grid-columns: 1
252 |       INT Vec ALU
253 |       INT Vec MUL
254 |       FMA
255 |       FADD
256 |       FMUL
257 |       AES
258 |     }
259 |     rob -> nsq2 -> sched8 -> rf.vrf -> pipe16
260 | 
261 |     # Source: Chips and Cheese
262 |     pipe17: Port 22 {
263 |       INT Vec ALU
264 |     }
265 |     rob -> nsq2 -> sched8 -> rf.vrf -> pipe17
266 |   }
267 | 
268 |   frontend.rename -> backend.rob
269 |   frontend.rename -> backend.bob
270 | 
271 |   mem: Memory {
272 |     l1: L1 DC {
273 |       # Source: Chips and Cheese, Intel
274 |       # Level Entries Associativity     Architectural Page Size Cached Translation Size
275 |       # DTLB  48      Fully associative All                     4KB, 2MB
276 |       l1dtlb: 48-entry fully associative L1 DTLB
277 | 
278 |       # Source: Chips and Cheese
279 |       l1dc: 32KB 8-way L1DC
280 |     }
281 | 
282 |     l2: L2 {
283 |       # Source: Chips and Cheese, Intel
284 |       # Level Entries Associativity     Architectural Page Size Cached Translation Size
285 |       # STLB  3072    6-way             4K/2M/4M      4KB, 2MB
286 |       # STLB  16      Fully associative 1GB           1GB
287 |       l2tlb: 3072-entry 6-way 4K/2M/4M page, 16-entry fully associative 1G page L2 TLB
288 | 
289 |       # Source: Chips and Cheese, Intel
290 |       # "The L2 cache delivers 64 bytes of data per cycle at a latency of 17
291 |       # cycles, and that bandwidth is shared amongst 4 cores."
292 |       l2dc: 2MB 16-way Shared L2 Cache, 17 cycle latency
293 |     }
294 | 
295 |     # Source: Chips and Cheese
296 |     l1 -> l2: 64B/cycle shared among 4 cores
297 | 
298 |     l3: L3 {
299 |       # Source: Chips and Cheese
300 |       l3dc: 24MB 12-way L3 Cache
301 |     }
302 |     l2 -> l3
303 |   }
304 |   frontend.l1ic -> mem.l2
305 |   backend.lsu -> mem.l1
306 | 
307 |   info: |md
308 |     Drawn by Jiajie Chen @jiegec
309 | 
310 |     Based on data from Chips and Cheese
311 |   |
312 | }
313 | 


--------------------------------------------------------------------------------
/docs/crestmont.md:
--------------------------------------------------------------------------------
 1 | # Intel Meteor Lake E-core aka Crestmont
 2 | 
 3 | ![](./crestmont.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Meteor Lake’s E-Cores: Crestmont Makes Incremental Progress](https://chipsandcheese.com/2024/05/13/meteor-lakes-e-cores-crestmont-makes-incremental-progress/)
 8 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1
 9 | - [The Next Generation of High Performance, Energy-Efficient Computing: Intel® Xeon® Processors Built on Efficient-Core](https://hc2023.hotchips.org/assets/program/conference/day1/CPU2/HC2023.Intel.Soltis.FINAL.pdf)
10 | 


--------------------------------------------------------------------------------
/docs/dieshot.md:
--------------------------------------------------------------------------------
 1 | # Dieshot
 2 | 
 3 | - AMD Ryzen 9 9900X: https://www.bilibili.com/opus/965843745820901377
 4 | - AMD Strix Point: https://www.bilibili.com/opus/959217298443337751
 5 | - Apple A17: https://www.youtube.com/watch?v=veikj5uvAc8
 6 | - Google Tensor Gen1: https://twitter.com/Kurnalsalts/status/1792171201076551747 https://www.bilibili.com/opus/942059110279413798
 7 | - Google Tensor Gen2: https://twitter.com/Kurnalsalts/status/1792171201076551747 https://www.bilibili.com/opus/942059110279413798
 8 | - Google Tensor Gen3: https://twitter.com/Kurnalsalts/status/1799097967884083393 https://www.bilibili.com/opus/942059110279413798
 9 | - Intel Meteor Lake: https://www.bilibili.com/opus/967751673346785305
10 | - MediaTek Dimensity 9200: 1x Cortex-X3, 3x Cortex-A715, 4x Cortex-A510, Immortalis-G715 https://i.mediatek.com/dimensity-9200 https://kurnal.xlog.app/D9200
11 | - MediaTek Dimensity 9300: 4x Cortex-X4, 4x Cortex-A720, Immortalis-G720 https://i.mediatek.com/dimensity-9300 https://twitter.com/Kurnalsalts/status/1767976930446553369
12 | - MediaTek Dimensity 930: 2x Cortex-A78, 6x Cortex-A55 https://i.mediatek.com/dimensity-930 https://twitter.com/Kurnalsalts/status/1727746638574420285
13 | - MediaTek Helio X30: 2x Cortex-A73, 4x Cortex-A53, 4x Cortex-A35 https://www.mediatek.com/products/smartphones-2/mediatek-helio-x30 https://twitter.com/Kurnalsalts/status/1790624036613218466
14 | - Nvidia AD102: https://twitter.com/Kurnalsalts/status/1784611359608680563
15 | - Qualcomm 7 Gen1: 4x(1+3) Cortex-A710, 4x Cortex-A510 https://twitter.com/Kurnalsalts/status/1776276658426966068
16 | - Qualcomm 8 Gen2: 1x Cortex-X3, 2x Cortex-A715, 2x Cortex-A710, 3x Cortex-A510 https://twitter.com/Kurnalsalts/status/1705935348893905147 https://kurnal.xlog.app/SM8550
17 | - Qualcomm 8 Gen3: 1x Cortex-X4, 5x(2+3) Cortex-A720, 2x Cortex-A520 https://twitter.com/Kurnalsalts/status/1776276658426966068 https://twitter.com/Kurnalsalts/status/1776201339615514721
18 | - Qualcomm 845: https://kurnal.xlog.app/SDM845
19 | - Qualcomm 888: https://twitter.com/Kurnalsalts/status/1727744875305832710
20 | - Qualcomm 8s Gen3: https://t.bilibili.com/916817677665697796 https://twitter.com/Kurnalsalts/status/1776276658426966068 https://twitter.com/Kurnalsalts/status/1776201339615514721
21 | - Qualcomm X Elite: https://tieba.baidu.com/p/9194576062 https://chipwise.tech/our-portfolio/snapdragon-x-elite/
22 | - Qualcomm 8 Elite: https://x.com/Kurnalsalts/status/1848700612181168601
23 | - Samsung Exynos 2100: https://twitter.com/Kurnalsalts/status/1784620815474135151
24 | - Samsung Exynos 2200: https://twitter.com/Kurnalsalts/status/1785040012188471347 https://www.bilibili.com/opus/942039658357850146
25 | - Samsung Exynos 2300: https://www.bilibili.com/opus/942039658357850146
26 | - Samsung Exynos 2400: https://twitter.com/Kurnalsalts/status/1785252470408773986 https://www.bilibili.com/opus/942039658357850146
27 | 


--------------------------------------------------------------------------------
/docs/firestorm.d2:
--------------------------------------------------------------------------------
  1 | cpu : Apple M1 Firestorm CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: JamesAslan, Chips and Cheese
  5 |       # Source: jiegec, see /data/firestorm/btb.csv
  6 |       l1btb: 1024-entry L1 BTB, 1 cycle latency
  7 | 
  8 |       # Source: JamesAslan
  9 |       # Source: jiegec, see hw.perflevel0.l1icachesize in /data/firestorm/cache.txt
 10 |       l2btb: 192KB L1 IC as L2 BTB, 3 cycle latency
 11 | 
 12 |       # Source: jiegec, see /data/firestorm/ras.csv
 13 |       ras: 50-entry RAS
 14 |     }
 15 | 
 16 |     # Coupled Frontend
 17 |     l1ic: L1 IC {
 18 |       # Source: Dougall Johnson
 19 |       # Source: jiegec, see hw.perflevel0.l1icachesize in /data/firestorm/cache.txt
 20 |       l1ic: 192KB L1 IC
 21 |     }
 22 | 
 23 |     bp -> l1ic
 24 | 
 25 |     # Source: Dougall Johnson
 26 |     decode: 8-way Decode
 27 |     # Source: jiegec
 28 |     l1ic -> decode: 16 inst/cycle
 29 |     decode -> bp
 30 | 
 31 |     # Source: Dougall Johnson
 32 |     rename: 8-way Rename
 33 |     decode -> rename
 34 |   }
 35 | 
 36 |   backend: Backend {
 37 |     # Source: Dougall Johnson
 38 |     # Source: jiegec, ~325
 39 |     rob: ~330-entry Coalesced ROB
 40 | 
 41 |     # Source: Dougall Johnson
 42 |     # Source: jiegec, ~620
 43 |     rename: ~623-entry Rename History
 44 | 
 45 |     rf: Register File {
 46 |       # Source: Dougall Johnson
 47 |       irf: ~380-entry Integer Register File
 48 | 
 49 |       # Source: Dougall Johnson
 50 |       vrf: ~432-entry 128b Vector Register File
 51 | 
 52 |       # Source: Dougall Johnson
 53 |       flagsrf: ~128-entry Flags Register File
 54 |     }
 55 | 
 56 |     # Source: Dougall Johnson
 57 |     # Source: jiegec, 158(=12+12+24+26+16+12+28+28) sched size for alu
 58 |     # Source: jiegec, 133(~=134=24+26+16+12+28+28) sched w/o nsq size for alu
 59 |     # Source: jiegec, 68(12+28+28) sched size for imul
 60 |     # Source: jiegec, 56(28+28) sched w/o nsq size for imul
 61 |     dispatch1: 12-entry ALU Dispatch Queue \#1
 62 | 
 63 |     # Source: Dougall Johnson
 64 |     sched1: 24-entry Scheduler \#1
 65 | 
 66 |     # Source: Dougall Johnson
 67 |     pipe1: Pipe \#1 {
 68 |       ALU
 69 |       FLAGS
 70 |       B/BL/ADR
 71 |       MOV NZCV
 72 |       MRS
 73 |     }
 74 |     dispatch1 -> sched1 -> rf.irf -> pipe1
 75 | 
 76 |     # Source: Dougall Johnson
 77 |     sched2: 26-entry Scheduler \#2
 78 | 
 79 |     # Source: Dougall Johnson
 80 |     pipe2: Pipe \#2 {
 81 |       ALU
 82 |       FLAGS
 83 |       B/BL/ADR
 84 |       MOV NZCV
 85 |       PTRAUTH
 86 |       BR/BLR
 87 |     }
 88 |     dispatch1 -> sched2 -> rf.irf -> pipe2
 89 | 
 90 |     # Source: Dougall Johnson
 91 |     sched3: 16-entry Scheduler \#3
 92 | 
 93 |     # Source: Dougall Johnson
 94 |     pipe3: Pipe \#3 {
 95 |       ALU
 96 |       FLAGS
 97 |       FROM FP
 98 |     }
 99 |     dispatch1 -> sched3 -> rf.irf -> pipe3
100 | 
101 |     # Source: Dougall Johnson
102 |     dispatch2: 12-entry ALU Dispatch Queue \#2
103 | 
104 |     # Source: Dougall Johnson
105 |     sched4: 12-entry Scheduler \#4
106 | 
107 |     # Source: Dougall Johnson
108 |     pipe4: Pipe \#4 {
109 |       ALU
110 |       FROM FP
111 |     }
112 |     dispatch2 -> sched4 -> rf.irf -> pipe4
113 | 
114 |     # Source: Dougall Johnson
115 |     sched5: 28-entry Scheduler \#5
116 | 
117 |     # Source: Dougall Johnson
118 |     pipe5: Pipe \#5 {
119 |       ALU
120 |       MUL
121 |       DIV
122 |     }
123 |     dispatch2 -> sched5 -> rf.irf -> pipe5
124 | 
125 |     # Source: Dougall Johnson
126 |     sched6: 28-entry Scheduler \#6
127 | 
128 |     # Source: Dougall Johnson
129 |     pipe6: Pipe \#6 {
130 |       ALU
131 |       MUL
132 |       MADD
133 |       BFM
134 |       CRC
135 |     }
136 |     dispatch2 -> sched6 -> rf.irf -> pipe6
137 | 
138 |     # Source: Dougall Johnson
139 |     # Source: jiegec, 58(10+48) sched size for load & store
140 |     # Source: jiegec, 48 sched w/o nsq size for load & store
141 |     dispatch3: 10-entry Memory Dispatch Queue \#3
142 | 
143 |     # Source: Dougall Johnson
144 |     sched7: 48-entry Memory Scheduler \#7
145 | 
146 |     # Source: Dougall Johnson
147 |     pipe7: Pipe \#7 {
148 |       STORE
149 |       AMX
150 |     }
151 |     dispatch3 -> sched7 -> rf.irf -> pipe7
152 | 
153 |     # Source: Dougall Johnson
154 |     pipe8: Pipe \#8 {
155 |       LOAD
156 |       STORE
157 |       AMX
158 |     }
159 |     dispatch3 -> sched7 -> rf.irf -> pipe8
160 | 
161 |     # Source: Dougall Johnson
162 |     pipe9: Pipe \#9 {
163 |       LOAD
164 |     }
165 |     dispatch3 -> sched7 -> rf.irf -> pipe9
166 | 
167 |     # Source: Dougall Johnson
168 |     pipe10: Pipe \#10 {
169 |       LOAD
170 |     }
171 |     dispatch3 -> sched7 -> rf.irf -> pipe10
172 | 
173 |     lsu: LSU {
174 |       # Source: Dougall Johnson
175 |       # See https://dougallj.wordpress.com/2021/04/08/apple-m1-load-and-store-queue-measurements/
176 |       # Source: jiegec, see /data/firestorm/lsu.csv:
177 |       # load: spike at 130(pattern 4) and 188(pattern 5, 188=130+48(scheduler)+10(dispatch queue))
178 |       130-entry Load Queue
179 |       # store: spike at 107(pattern 2 & 6, 107=60+48(scheduler)-1) and 118(pattern 7, 60+48(scheduler)+10(dispatch queue))
180 |       # 60 was computed by 118 - 48(scheduler) - 10(dispatch queue)
181 |       60-entry Store Queue
182 |       # Source: jiegec
183 |       2 Load Pipe
184 |       1 Store Pipe
185 |       1 Load/Store Pipe
186 |       # Source: jiegec
187 |       4 cycle load to use latency
188 |       3 cycle load to use latency in pointer chasing
189 |     }
190 | 
191 |     pipe7 -> lsu
192 |     pipe8 -> lsu
193 |     pipe9 -> lsu
194 |     pipe10 -> lsu
195 | 
196 |     rob -> dispatch1
197 |     rob -> dispatch2
198 |     rob -> dispatch3
199 | 
200 |     # Source: Dougall Johnson
201 |     # Source: jiegec, 156(=12+36+36+36+36) sched size for fp
202 |     # Source: jiegec, 143(~=144=36+36+36+36) sched w/o nsq size for fp
203 |     dispatch4: 12-entry FP Dispatch Queue \#4
204 | 
205 |     # Source: Dougall Johnson
206 |     sched8: 36-entry Scheduler \#8
207 | 
208 |     # Source: Dougall Johnson
209 |     pipe11: Pipe \#11 {
210 |       FP/SIMD
211 |     }
212 |     dispatch4 -> sched8 -> rf.vrf -> pipe11
213 | 
214 |     # Source: Dougall Johnson
215 |     sched9: 36-entry Scheduler \#9
216 | 
217 |     # Source: Dougall Johnson
218 |     pipe12: Pipe \#12 {
219 |       FP/SIMD
220 |     }
221 |     dispatch4 -> sched9 -> rf.vrf -> pipe12
222 | 
223 |     # Source: Dougall Johnson
224 |     sched10: 36-entry Scheduler \#10
225 | 
226 |     # Source: Dougall Johnson
227 |     pipe13: Pipe \#13 {
228 |       FP/SIMD
229 |       FCSEL
230 |       TO INT
231 |     }
232 |     dispatch4 -> sched10 -> rf.vrf -> pipe13
233 | 
234 |     # Source: Dougall Johnson
235 |     sched11: 36-entry Scheduler \#11
236 | 
237 |     # Source: Dougall Johnson
238 |     pipe14: Pipe \#14 {
239 |       FP/SIMD
240 |       FCSEL
241 |       TO INT
242 |       DIV/RECP
243 |       SQRT/SHA
244 |       JCVTZS
245 |     }
246 |     dispatch4 -> sched11 -> rf.vrf -> pipe14
247 |     rob -> dispatch4
248 |   }
249 |   frontend.rename -> backend.rob
250 |   frontend.rename -> backend.rename
251 | 
252 |   mem: Memory {
253 |     l1: L1 DC {
254 |       # Source: Anandtech
255 |       l1dtlb: 256-entry L1 DTLB
256 | 
257 |       # Source: Dougall Johnson
258 |       # Source: jiegec, see hw.perflevel0.l1dcachesize in /data/firestorm/cache.txt
259 |       l1dc: 128KB 8-way L1DC
260 |     }
261 | 
262 |     l2: L2 {
263 |       # Source: Anandtech
264 |       l2tlb: 3072-entry L2 TLB
265 | 
266 |       # Source: Dougall Johnson
267 |       l2dc: 12MB L2 Cache per 4-Core cluster
268 |     }
269 |     l1 -> l2
270 |   }
271 |   frontend.l1ic -> mem.l2
272 |   backend.lsu -> mem.l1
273 | 
274 |   info: |md
275 |     Drawn by Jiajie Chen @jiegec
276 | 
277 |     Based on data from Chips and Cheese, Dougall Johnson, JamesAslan and Anandtech
278 |   |
279 | }


--------------------------------------------------------------------------------
/docs/firestorm.md:
--------------------------------------------------------------------------------
 1 | # Apple M1 P-core aka Firestorm
 2 | 
 3 | ![](./firestorm.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Apple Microarchitecture Research by Dougall Johnson](https://dougallj.github.io/applecpu/firestorm.html)
 8 | - [不为人知的角落，Apple M2 的小小努力（其一） - JamesAslan](https://zhuanlan.zhihu.com/p/662561990)
 9 | - [Apple Announces The Apple Silicon M1: Ditching x86 - What to Expect, Based on A14 - Anandtech](https://www.anandtech.com/show/16226/apple-silicon-m1-a14-deep-dive)
10 | - [Exploration of Apple CPUs](https://github.com/name99-org/AArch64-Explore)
11 | - [Apple M1 Icestorm 微架构评测（上）:重铸小核荣光](https://zhuanlan.zhihu.com/p/611213899)
12 | - [Apple M1 Icestorm 微架构（下）:重铸小核荣光](https://zhuanlan.zhihu.com/p/613097964)
13 | - [苹果的黑魔法？Apple M1 的栈操作消除（上）](https://zhuanlan.zhihu.com/p/595582920)
14 | - [苹果的黑魔法？（下）Apple M1 的栈操作消除](https://zhuanlan.zhihu.com/p/600349467)
15 | - [Apple Firestorm/Icestorm CPU microarchitecture docs](https://github.com/dougallj/applecpu)
16 | - [The 2020 Mac Mini Unleashed: Putting Apple Silicon M1 To The Test](https://www.anandtech.com/show/16252/mac-mini-apple-m1-tested)
17 | 


--------------------------------------------------------------------------------
/docs/golden_cove.md:
--------------------------------------------------------------------------------
 1 | # Intel Alder Lake P-core aka Golden Cove
 2 | 
 3 | ![](./golden_cove.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Popping the Hood on Golden Cove](https://chipsandcheese.com/2021/12/02/popping-the-hood-on-golden-cove/)
 8 | - [Intel Alder Lake CPU Architectures](https://ieeexplore.ieee.org/document/9747991)
 9 | - [Golden Cove](https://en.wikipedia.org/wiki/Golden_Cove)
10 | - [Golden Cove’s Vector Register File: Checking with Official (SPR) Data](https://chipsandcheese.com/2023/01/15/golden-coves-vector-register-file-checking-with-official-spr-data/)
11 | - [4th Gen Intel Xeon Scalable Sapphire Rapids Leaps Forward](https://www.servethehome.com/4th-gen-intel-xeon-scalable-sapphire-rapids-leaps-forward/7/)
12 | - [Intel Details Golden Cove: Next-Generation Big Core For Client and Server SoCs](https://fuse.wikichip.org/news/6111/intel-details-golden-cove-next-generation-big-core-for-client-and-server-socs/)
13 | - [Sapphire Rapids: Golden Cove Hits Servers](https://chipsandcheese.com/2023/03/12/a-peek-at-sapphire-rapids/)
14 | - [Golden Cove’s Lopsided Vector Register File](https://chipsandcheese.com/2022/12/25/golden-coves-lopsided-vector-register-file/)
15 | - [Alder Lake Architecture on Hot Chips 33](https://hc33.hotchips.org/assets/program/conference/day1/HC2021.C1.1%20Intel%20Efraim%20Rotem.pdf)
16 | - [Sapphire Rapids on Hot Chips 33](https://hc33.hotchips.org/assets/program/conference/day1/HC2021.C1.4%20Intel%20Arijit.pdf)
17 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1
18 | - [Sapphire Rapids Dieshot](https://www.bilibili.com/video/BV1nb421J7jy/)
19 | 


--------------------------------------------------------------------------------
/docs/gracemont.d2:
--------------------------------------------------------------------------------
  1 | cpu : Intel Alder Lake E-core Gracemont CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       l1btb: 1024-entry L1 BTB
  6 | 
  7 |       # Source: Chips and Cheese, Intel
  8 |       # Intel: "The NLP is backed by the second predictor that includes a 5K
  9 |       # entry target array combined with path-based information to make
 10 |       # predictions and verify target addresses in three cycles."
 11 |       l2btb: 5120-entry L2 BTB, 3 cycle latency
 12 | 
 13 |       # Source: Chips and Cheese
 14 |       ind: >=512-entry Indirect Target Array
 15 | 
 16 |       # Source: Intel
 17 |       bw: 32B/cycle prediction bandwidth
 18 |     }
 19 | 
 20 |     l1ic: L1 IC {
 21 |       # Source: Chips and Cheese
 22 |       itlb: 64-entry fully associative ITLB
 23 | 
 24 |       # Source: Chips and Cheese, Intel
 25 |       # Intel: "Larger 64KB Instruction Cache with dual 32B reads (32B read per
 26 |       # fetch cluster)."
 27 |       l1ic: 64KB 8-way L1 IC, 2x32 B/cycle
 28 |     }
 29 | 
 30 |     # Source: Chips and Cheese
 31 |     fq: 2x Fetch Queue
 32 |     bp -> fq
 33 |     fq -> l1ic
 34 | 
 35 |     iq1: Instruction Queue \#1
 36 |     l1ic -> iq1
 37 | 
 38 |     # Source: Chips and Cheese
 39 |     decode1: 3-way Decode \#1
 40 |     iq1 -> decode1
 41 | 
 42 |     # Source: Chips and Cheese
 43 |     uop1: UOP Queue \#1
 44 |     decode1 -> uop1
 45 | 
 46 |     iq2: Instruction Queue \#2
 47 |     l1ic -> iq2
 48 | 
 49 |     # Source: Chips and Cheese
 50 |     decode2: 3-way Decode \#2
 51 |     iq2 -> decode2
 52 | 
 53 |     # Source: Chips and Cheese
 54 |     uop2: UOP Queue \#2
 55 |     decode2 -> uop2
 56 | 
 57 |     # Source: Chips and Cheese, Intel
 58 |     rename: 5-way Rename {
 59 |       Move Elimination
 60 |       Zero Idiom
 61 |     }
 62 |     uop1 -> rename
 63 |     uop2 -> rename
 64 |   }
 65 | 
 66 |   backend: Backend {
 67 |     # Source: Chips and Cheese, Intel
 68 |     # Intel: "Retirement can be up to eight instructions per cycle for the
 69 |     # 256-entry retirement buffer."
 70 |     rob: 256-entry ROB, 8-wide retirement
 71 | 
 72 |     # Source: Chips and Cheese
 73 |     bob: 116-taken-entry 126-not-taken-entry Branch Order Buffer
 74 | 
 75 |     rf: Register File {
 76 |       # Source: Chips and Cheese
 77 |       irf: 214-entry Integer Register File
 78 | 
 79 |       # Source: Chips and Cheese
 80 |       vrf: 207-entry 128b Vector Register File
 81 |     }
 82 | 
 83 |     # Source: Chips and Cheese
 84 |     sched1: 15-entry ALU Scheduler \#1
 85 | 
 86 |     # Source: Chips and Cheese, Intel
 87 |     pipe1: Port 0 {
 88 |       grid-columns: 1
 89 |       ALU
 90 |       SHIFT
 91 |     }
 92 |     rob -> sched1 -> rf.irf -> pipe1
 93 | 
 94 |     # Source: Chips and Cheese
 95 |     sched2: 16-entry ALU Scheduler \#2
 96 | 
 97 |     # Source: Chips and Cheese
 98 |     pipe2: Port 1 {
 99 |       grid-columns: 1
100 |       ALU
101 |       SHIFT
102 |       INT MUL
103 |       INT DIV
104 |     }
105 |     rob -> sched2 -> rf.irf -> pipe2
106 | 
107 |     # Source: Chips and Cheese
108 |     sched3: 16-entry ALU Scheduler \#3
109 | 
110 |     # Source: Chips and Cheese, Intel
111 |     pipe3: Port 2 {
112 |       grid-columns: 1
113 |       ALU
114 |       SHIFT
115 |       INT MUL
116 |       INT DIV
117 |       PDEP
118 |     }
119 |     rob -> sched3 -> rf.irf -> pipe3
120 | 
121 |     # Source: Chips and Cheese
122 |     sched4: 15-entry ALU Scheduler \#4
123 | 
124 |     # Source: Chips and Cheese, Intel
125 |     pipe4: Port 3 {
126 |       grid-columns: 1
127 |       ALU
128 |       SHIFT
129 |     }
130 |     rob -> sched4 -> rf.irf -> pipe4
131 | 
132 |     # Source: Chips and Cheese
133 |     # Intel: "The fifth integer reservation station holds jumps and store data
134 |     # operations. This structure is banked and can schedule two uops of each
135 |     # type every cycle; two store data on ports 08 and 09, and two jumps on
136 |     # ports 30 and 31"
137 |     sched5: 42-entry Branch/Store Data Scheduler \#5
138 | 
139 |     # Source: Chips and Cheese, Intel
140 |     pipe5: Port 30 {
141 |       JMP/Branch
142 |     }
143 |     rob -> sched5 -> rf.irf -> pipe5
144 | 
145 |     # Source: Chips and Cheese, Intel
146 |     pipe6: Port 31 {
147 |       JMP/Branch
148 |     }
149 |     rob -> sched5 -> rf.irf -> pipe6
150 | 
151 |     # In a newer Chips and Cheese post (Meteor Lake’s E-Cores: Crestmont Makes Incremental Progress)
152 |     # the scheduler organization was described differently:
153 |     # "Intel’s optimization guide says Crestmont and Gracemont serve store data
154 |     # and jump operations with the same queue. I missed this in 2021 because
155 |     # that possibility didn’t cross my mind."
156 |     # Source: Chips and Cheese, Intel
157 |     pipe7: Port 8 {
158 |       Store Data
159 |     }
160 |     rob -> sched5 -> rf.irf -> pipe7
161 | 
162 |     # Source: Chips and Cheese, Intel
163 |     pipe8: Port 9 {
164 |       Store Data
165 |     }
166 |     rob -> sched5 -> rf.irf -> pipe8
167 | 
168 |     # Source: Chips and Cheese
169 |     nsq1: 22-entry AGU Non-Scheduling Queue \#1
170 | 
171 |     # Source: Chips and Cheese
172 |     sched6: 22-entry AGU Scheduler \#6
173 | 
174 |     # Source: Chips and Cheese, Intel
175 |     pipe9: Port 10 {
176 |       Load AGU
177 |     }
178 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe9
179 | 
180 |     # Source: Chips and Cheese, Intel
181 |     pipe10: Port 11 {
182 |       Load AGU
183 |     }
184 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe10
185 | 
186 |     # Source: Chips and Cheese, Intel
187 |     pipe11: Port 12 {
188 |       Store AGU
189 |     }
190 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe11
191 | 
192 |     # Source: Chips and Cheese, Intel
193 |     pipe12: Port 13 {
194 |       Store AGU
195 |     }
196 |     rob -> nsq1 -> sched6 -> rf.irf -> pipe12
197 | 
198 |     lsu: LSU {
199 |       # Source: Chips and Cheese
200 |       80-entry Load Queue
201 |       50-entry Store Queue
202 | 
203 |       # Source: jiegec
204 |       4 cycle load to use latency
205 |       3 cycle load to use latency in pointer chasing
206 |     }
207 | 
208 |     pipe7 -> lsu
209 |     pipe8 -> lsu
210 |     pipe9 -> lsu
211 |     pipe10 -> lsu
212 |     pipe11 -> lsu
213 |     pipe12 -> lsu
214 | 
215 |     # Source: Chips and Cheese
216 |     nsq2: 56-entry FP Non-Scheduling Queue \#2
217 | 
218 |     # Source: Chips and Cheese
219 |     sched7: 18-entry FStore Scheduler \#7
220 | 
221 |     # Source: Chips and Cheese, Intel
222 |     pipe13: Port 28 {
223 |       FP Store Data
224 |     }
225 |     rob -> nsq2 -> sched7 -> rf.vrf -> pipe13
226 | 
227 |     # Source: Chips and Cheese, Intel
228 |     pipe14: Port 29 {
229 |       FP Store Data
230 |     }
231 |     rob -> nsq2 -> sched7 -> rf.vrf -> pipe14
232 | 
233 |     # Source: Chips and Cheese
234 |     sched8: 35-entry FP/Vector Scheduler \#8
235 | 
236 |     # Source: Chips and Cheese, Intel
237 |     pipe15: Port 20 {
238 |       grid-columns: 1
239 |       INT Vec ALU
240 |       INT Vec MUL
241 |       FMA
242 |       FADD
243 |       FMUL
244 |       AES
245 |       FDIV
246 |       SHA
247 |     }
248 |     rob -> nsq2 -> sched8 -> rf.vrf -> pipe15
249 | 
250 |     # Source: Chips and Cheese, Intel
251 |     pipe16: Port 21 {
252 |       grid-columns: 1
253 |       INT Vec ALU
254 |       FMA
255 |       FADD
256 |       FMUL
257 |       AES
258 |     }
259 |     rob -> nsq2 -> sched8 -> rf.vrf -> pipe16
260 | 
261 |     # Source: Chips and Cheese, Intel
262 |     pipe17: Port 22 {
263 |       INT Vec ALU
264 |     }
265 |     rob -> nsq2 -> sched8 -> rf.vrf -> pipe17
266 |   }
267 | 
268 |   frontend.rename -> backend.rob
269 |   frontend.rename -> backend.bob
270 | 
271 |   mem: Memory {
272 |     l1: L1 DC {
273 |       # Source: Chips and Cheese
274 |       # Chips and Cheese: 48-entry
275 |       # Intel: 32-entry
276 |       l1dtlb: 32-entry fully associative L1 DTLB
277 | 
278 |       # Source: Chips and Cheese
279 |       # Intel: "4-cycle load-to-use latency."
280 |       l1dc: 32KB 8-way L1DC, 4 cycle load-to-use latency
281 |     }
282 | 
283 |     l2: L2 {
284 |       # Source: Chips and Cheese, Intel
285 |       # Intel: "The main STLB is 2048 entries 4-way set associative and caches
286 |       # 4KB and 2MB translations. Additionally, Gracemont microarchitecture has
287 |       # an 8-entry fully associative structure for GB translations"
288 |       l2tlb: 2048-entry 4-way 4K/2M/4M page, 8-entry fully associative 1G page L2 TLB
289 | 
290 |       # Source: Chips and Cheese, Intel
291 |       # Intel: "The L2 cache delivers 64 bytes of data per cycle at a latency of
292 |       # 17 cycles, and that bandwidth is shared among four cores."
293 |       l2dc: 2MB 16-way Shared L2 Cache, 17 cycle latency
294 | 
295 |       # Source: Intel
296 |       # Intel: "Pipelined Page Miss Handler capable of handling 4 concurrent
297 |       # page walks."
298 |       ptw: 4 page table walkers
299 |     }
300 | 
301 |     # Source: Chips and Cheese, Intel
302 |     l1 -> l2: 64B/cycle shared among 4 cores
303 | 
304 |     l3: L3 {
305 |       # Source: Chips and Cheese
306 |       l3dc: 30MB 12-way L3 Cache
307 |     }
308 |     l2 -> l3
309 |   }
310 |   frontend.l1ic -> mem.l2
311 |   backend.lsu -> mem.l1
312 | 
313 |   info: |md
314 |     Drawn by Jiajie Chen @jiegec
315 | 
316 |     Based on data from Chips and Cheese, Intel
317 |   |
318 | }


--------------------------------------------------------------------------------
/docs/gracemont.md:
--------------------------------------------------------------------------------
 1 | # Intel Alder Lake E-core aka Gracemont
 2 | 
 3 | ![](./gracemont.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Gracemont: Revenge of the Atom Cores](https://chipsandcheese.com/2021/12/21/gracemont-revenge-of-the-atom-cores/)
 8 | - [Intel Alder Lake CPU Architectures](https://ieeexplore.ieee.org/document/9747991)
 9 | - [Intel’s Gracemont Small Core Eclipses Last-Gen Big Core Performance](https://fuse.wikichip.org/news/6102/intels-gracemont-small-core-eclipses-last-gen-big-core-performance/)
10 | - [Meteor Lake’s E-Cores: Crestmont Makes Incremental Progress](https://chipsandcheese.com/2024/05/13/meteor-lakes-e-cores-crestmont-makes-incremental-progress/)
11 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1
12 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # CPU Microarchitecture Diagrams
 2 | 
 3 | Microarchitecture diagrams of several CPUs.
 4 | 
 5 | Major microarchitectures of CPU vendors:
 6 | 
 7 | 1. AMD: Zen 1 -> Zen 2 -> Zen 3 -> Zen 4 -> Zen 5
 8 | 2. ARM:
 9 | 	- Cortex-A73
10 | 	- Cortex-A75
11 | 	- Cortex-A76/Neoverse-N1
12 | 	- Cortex-A77
13 | 	- Cortex-A78/Cortex-X1/Neoverse-V1
14 | 	- Cortex-A510/Cortex-A710/Cortex-X2/Neoverse-N2
15 | 	- Cortex-A715/Cortex-X3/Neoverse-V2
16 | 	- Cortex-A520/Cortex-A720/Cortex-X4/Neoverse-V3
17 | 	- Cortex-A725/Cortex-X925/Neoverse-N3
18 | 3. Apple:
19 | 	- M1/A14 (Firestorm + Icestorm)
20 | 	- M2/A15 (Avalanche + Blizzard)
21 | 	- A16 (Everest + Sawtooth)
22 | 	- M3
23 | 	- A17
24 | 	- M4
25 | 	- A18
26 | 4. Intel:
27 | 	- Skylake
28 | 	- Sunny Cove (Ice Lake)
29 | 	- Golden Cove + Gracemont (Alder Lake/Sapphire Rapids) -> Raptor Cove + Gracemont (Raptor Lake/Emerald Rapids)
30 | 	- Redwood Cove + Crestmont (Meteor Lake/Granite Rapids/Sierra Forest)
31 | 	- Lion Cove + Skymont (Lunar Lake/Arrow Lake/Clearwater Forest)
32 | 5. Qualcomm: Oryon
33 | 


--------------------------------------------------------------------------------
/docs/lion_cove.d2:
--------------------------------------------------------------------------------
  1 | cpu: Intel Lunar Lake P-core Lion Cove CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor
  4 | 
  5 |     # Source: David Huang
  6 |     l1ic: 64KB L1 IC
  7 | 
  8 |     bp -> l1ic
  9 | 
 10 |     iq: Instruction Queue
 11 |     l1ic -> iq
 12 | 
 13 |     # Source: Intel
 14 |     # "DECODE (8-WIDE)"
 15 |     decode: 8-way Decode
 16 |     iq -> decode
 17 | 
 18 |     uopc: UOP Cache
 19 |     decode -> uopc
 20 |     bp -> uopc
 21 | 
 22 |     # Source: Intel
 23 |     # "uOP CACHE (12-WIDE)"
 24 |     uop: UOP Queue
 25 |     uopc -> uop: 12 UOP/cycle
 26 |     decode -> uop: 8 UOP/cycle
 27 | 
 28 |     # Source: Intel
 29 |     # "6 -> 8 wide alloc/rename"
 30 |     rename: 8-way Rename {
 31 |       Move Elimination
 32 |       Zero Idiom
 33 |     }
 34 |     uop -> rename
 35 |   }
 36 | 
 37 |   backend: Backend {
 38 |     # Source: Intel
 39 |     # "8 -> 12 wide retirement" and "512 -> 576 deep instruction window"
 40 |     rob: 576-entry ROB, 12 wide retirement
 41 | 
 42 |     rf: Register File {
 43 |       irf: Integer Register File
 44 | 
 45 |       flagsrf: Flags Register File
 46 | 
 47 |       vrf: FP/Vector Register File
 48 |     }
 49 | 
 50 |     sched1: Integer Scheduler
 51 | 
 52 |     # Source: Intel
 53 |     pipe1: Port 0 {
 54 |       grid-columns: 1
 55 |       ALU
 56 |       JMP
 57 |     }
 58 |     rob -> sched1 -> rf -> pipe1
 59 | 
 60 |     # Source: Intel
 61 |     pipe2: Port 1 {
 62 |       grid-columns: 1
 63 |       ALU
 64 |       SHIFT
 65 |       MUL
 66 |     }
 67 |     rob -> sched1 -> rf -> pipe2
 68 | 
 69 |     # Source: Intel
 70 |     pipe3: Port 2 {
 71 |       grid-columns: 1
 72 |       ALU
 73 |       JMP
 74 |     }
 75 |     rob -> sched1 -> rf -> pipe3
 76 | 
 77 |     # Source: Intel
 78 |     pipe4: Port 3 {
 79 |       grid-columns: 1
 80 |       ALU
 81 |       SHIFT
 82 |       MUL
 83 |     }
 84 |     rob -> sched1 -> rf -> pipe4
 85 | 
 86 |     # Source: Intel
 87 |     pipe5: Port 4 {
 88 |       grid-columns: 1
 89 |       ALU
 90 |       JMP
 91 |     }
 92 |     rob -> sched1 -> rf -> pipe5
 93 | 
 94 |     # Source: Intel
 95 |     pipe6: Port 5 {
 96 |       grid-columns: 1
 97 |       ALU
 98 |       SHIFT
 99 |       MUL
100 |     }
101 |     rob -> sched1 -> rf -> pipe6
102 | 
103 |     sched2: Memory Scheduler
104 | 
105 |     # Source: Intel
106 |     pipe7: Port 20 {
107 |       Load AGU
108 |     }
109 |     rob -> sched2 -> rf -> pipe7
110 | 
111 |     # Source: Intel
112 |     pipe8: Port 25 {
113 |       Store AGU
114 |     }
115 |     rob -> sched2 -> rf -> pipe8
116 | 
117 |     # Source: Intel
118 |     pipe9: Port 21 {
119 |       Load AGU
120 |     }
121 |     rob -> sched2 -> rf -> pipe9
122 | 
123 |     # Source: Intel
124 |     pipe10: Port 26 {
125 |       Store AGU
126 |     }
127 |     rob -> sched2 -> rf -> pipe10
128 | 
129 |     # Source: Intel
130 |     pipe11: Port 22 {
131 |       Load AGU
132 |     }
133 |     rob -> sched2 -> rf -> pipe11
134 | 
135 |     # Source: Intel; Store AGU issues from the Memory Scheduler like Ports 25/26
136 |     pipe12: Port 27 {
137 |       Store AGU
138 |     }
139 |     rob -> sched2 -> rf -> pipe12
140 | 
141 |     sched3: Store Data Scheduler
142 | 
143 |     # Source: Intel
144 |     pipe13: Port 10 {
145 |       Store Data
146 |     }
147 |     rob -> sched3 -> rf -> pipe13
148 | 
149 |     # Source: Intel
150 |     pipe14: Port 11 {
151 |       Store Data
152 |     }
153 |     rob -> sched3 -> rf -> pipe14
154 | 
155 |     lsu: LSU {
156 |       lq: Load Queue
157 |       sq: Store Queue
158 |     }
159 | 
160 |     pipe7 -> lsu
161 |     pipe8 -> lsu
162 |     pipe9 -> lsu
163 |     pipe10 -> lsu
164 |     pipe11 -> lsu
165 |     pipe12 -> lsu
166 |     pipe13 -> lsu
167 |     pipe14 -> lsu
168 | 
169 |     sched4: Vector Scheduler
170 | 
171 |     # Source: Intel
172 |     pipe15: Port V0 {
173 |       grid-columns: 1
174 |       FMA
175 |       ALU
176 |       SHIFT
177 |     }
178 |     rob -> sched4 -> rf -> pipe15
179 | 
180 |     # Source: Intel
181 |     pipe16: Port V1 {
182 |       grid-columns: 1
183 |       FADD
184 |       ALU
185 |       SHUF
186 |       FPDIV
187 |     }
188 |     rob -> sched4 -> rf -> pipe16
189 | 
190 |     # Source: Intel
191 |     pipe17: Port V2 {
192 |       grid-columns: 1
193 |       FMA
194 |       ALU
195 |       SHIFT
196 |     }
197 |     rob -> sched4 -> rf -> pipe17
198 | 
199 |     # Source: Intel
200 |     pipe18: Port V3 {
201 |       grid-columns: 1
202 |       FADD
203 |       ALU
204 |       SHUF
205 |       FPDIV
206 |     }
207 |     rob -> sched4 -> rf -> pipe18
208 |   }
209 |   frontend.rename -> backend.rob
210 | 
211 |   mem: Memory {
212 |     l0: L0 DC {
213 |       # Source: Intel
214 |       # "96 -> 128 pages DTLB"
215 |       l1dtlb: 128-entry L1 DTLB
216 | 
217 |       # Source: Intel
218 |       l0dc: 48KB L0 DC
219 |       4 cycle load to use latency
220 |       3x256b or 2x512b read per cycle
221 |     }
222 | 
223 |     l1: L1 DC {
224 |       # Source: Intel
225 |       192KB L1 DC
226 |       9 cycle load to use latency
227 |       2x64B read per cycle
228 |     }
229 | 
230 |     l0 -> l1
231 | 
232 |     l2: L2 Cache {
233 |       # Source: Intel
234 |       l2dc: 2.5MB/3MB L2 Cache
235 |       17 cycle load to use latency
236 |       2x64B read per cycle
237 |     }
238 | 
239 |     l1 -> l2
240 |   }
241 |   frontend.l1ic -> mem.l2
242 |   backend.lsu -> mem.l0
243 | 
244 |   info: |md
245 |     Drawn by Jiajie Chen @jiegec
246 | 
247 |     Based on data from Chips and Cheese, Intel, David Huang and Anandtech
248 |   |
249 | }
250 | 


--------------------------------------------------------------------------------
/docs/lion_cove.md:
--------------------------------------------------------------------------------
 1 | # Intel Lunar Lake P-core aka Lion Cove
 2 | 
 3 | ![](./lion_cove.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [An Interview with Intel’s Arik Gihon about Lunar Lake at Hot Chips 2024](https://chipsandcheese.com/2024/09/02/an-interview-with-intels-arik-gihon-about-lunar-lake-at-hot-chips-2024/)
 8 | - [Intel’s Lion Cove Architecture Preview](https://chipsandcheese.com/2024/06/03/intels-lion-cove-architecture-preview/)
 9 | - [2024 Intel Tech Tour: Next Gen P-core-The Lion Cove Microarchitecture](https://www.intel.com/content/www/us/en/content-details/824430/2024-intel-tech-tour-next-gen-p-core-the-lion-cove-microarchitecture.html)
10 | - [2024 Intel Tech Tour: LNL Architecture Session Highlights](https://www.intel.com/content/www/us/en/content-details/824443/2024-intel-tech-tour-lnl-architecture-session-highlights.html)
11 | - [2024 Intel Technology Tour Keynote](https://www.intel.com/content/www/us/en/content-details/824444/2024-intel-technology-tour-keynote.html)
12 | - [月光下的新探索：Lunar Lake CPU (Lion Cove / Skymont) 微架构测试](https://blog.hjc.im/lunar-lake-cpu-uarch-review.html)
13 | - [Lion Cove: Intel’s P-Core Roars](https://chipsandcheese.com/2024/09/27/lion-cove-intels-p-core-roars/)
14 | - [Intel Announces Core Ultra 200S Arrow Lake CPUs](https://www.phoronix.com/review/intel-core-ultra-200-arrow-lake)
15 | - [Intel® Core Ultra Desktop Processors Launch Briefing](https://download.intel.com/newsroom/2024/client-computing/Intel-Core-Ultra-200S-Series-Presentation.pdf)
16 | 


--------------------------------------------------------------------------------
/docs/m3_pcore.d2:
--------------------------------------------------------------------------------
  1 | cpu : Apple M3 P-core CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     # Coupled Frontend
  7 |     l1ic: L1 IC {
  8 |       # Source: Geekerwan
  9 |       l1ic: 192KB L1 IC
 10 |     }
 11 | 
 12 |     bp -> l1ic
 13 | 
 14 |     # Source: Geekerwan
 15 |     decode: 9-way Decode
 16 |     l1ic -> decode
 17 |     decode -> bp
 18 | 
 19 |     # Source: Geekerwan
 20 |     rename: 9-way Rename
 21 |     decode -> rename
 22 |   }
 23 | 
 24 |   backend: Backend {
 25 |     # Source: Geekerwan
 26 |     rob: 321-entry Coalesced ROB
 27 | 
 28 |     rf: Register File {
 29 |       # Source: Geekerwan
 30 |       irf: ~368-entry Integer Register File
 31 | 
 32 |       # Source: Geekerwan
 33 |       vrf: ~423-entry 128b Vector Register File
 34 |     }
 35 | 
 36 |     # Source: Geekerwan
 37 |     dispatch1: 12-entry Dispatch Queue \#1
 38 | 
 39 |     # Source: Geekerwan
 40 |     sched1: 78-entry Scheduler \#1
 41 | 
 42 |     # Source: Geekerwan
 43 |     pipe1: Pipe \#1 {
 44 |       ALU
 45 |       BR
 46 |       FLAGS
 47 |       ADR
 48 |     }
 49 |     dispatch1 -> sched1 -> rf.irf -> pipe1
 50 | 
 51 |     # Source: Geekerwan
 52 |     pipe2: Pipe \#2 {
 53 |       ALU
 54 |       BR
 55 |       FLAGS
 56 |       ADR
 57 |     }
 58 |     dispatch1 -> sched1 -> rf.irf -> pipe2
 59 | 
 60 |     # Source: Geekerwan
 61 |     pipe3: Pipe \#3 {
 62 |       ALU
 63 |       FLAGS
 64 |       ADR
 65 |     }
 66 |     dispatch1 -> sched1 -> rf.irf -> pipe3
 67 | 
 68 |     # Source: Geekerwan
 69 |     pipe4: Pipe \#4 {
 70 |       ALU
 71 |       FLAGS
 72 |       ADR
 73 |     }
 74 |     dispatch1 -> sched1 -> rf.irf -> pipe4
 75 | 
 76 |     # Source: Geekerwan
 77 |     dispatch2: 12-entry Dispatch Queue \#2
 78 | 
 79 |     # Source: Geekerwan
 80 |     sched2: 36-entry Scheduler \#2
 81 | 
 82 |     # Source: Geekerwan
 83 |     pipe5: Pipe \#5 {
 84 |       ALU
 85 |       ADR
 86 |     }
 87 |     dispatch2 -> sched2 -> rf.irf -> pipe5
 88 | 
 89 |     # Source: Geekerwan
 90 |     pipe6: Pipe \#6 {
 91 |       ALU
 92 |     }
 93 |     dispatch2 -> sched2 -> rf.irf -> pipe6
 94 | 
 95 |     # Source: Geekerwan
 96 |     sched3: 26-entry Scheduler \#3
 97 | 
 98 |     # Source: Geekerwan
 99 |     pipe7: Pipe \#7 {
100 |       ALU
101 |       MUL
102 |       DIV
103 |     }
104 |     dispatch2 -> sched3 -> rf.irf -> pipe7
105 | 
106 |     # Source: Geekerwan
107 |     sched4: 26-entry Scheduler \#4
108 | 
109 |     # Source: Geekerwan
110 |     pipe8: Pipe \#8 {
111 |       ALU
112 |       MUL
113 |       BFM
114 |       MADD
115 |     }
116 |     dispatch2 -> sched4 -> rf.irf -> pipe8
117 | 
118 |     # Source: Geekerwan
119 |     dispatch3: 10-entry Dispatch Queue \#3
120 | 
121 |     # Source: Geekerwan
122 |     sched5: 60-entry Scheduler \#5
123 | 
124 |     # Source: Geekerwan
125 |     pipe9: Pipe \#9 {
126 |       STORE
127 |     }
128 |     dispatch3 -> sched5 -> rf.irf -> pipe9
129 | 
130 |     # Source: Geekerwan
131 |     pipe10: Pipe \#10 {
132 |       LOAD
133 |       STORE
134 |     }
135 |     dispatch3 -> sched5 -> rf.irf -> pipe10
136 | 
137 |     # Source: Geekerwan
138 |     pipe11: Pipe \#11 {
139 |       LOAD
140 |     }
141 |     dispatch3 -> sched5 -> rf.irf -> pipe11
142 | 
143 |     # Source: Geekerwan
144 |     pipe12: Pipe \#12 {
145 |       LOAD
146 |     }
147 |     dispatch3 -> sched5 -> rf.irf -> pipe12
148 | 
149 |     lsu: LSU {
150 |       # Source: Geekerwan
151 |       142-entry Load Queue
152 |       63-entry Store Queue
153 |     }
154 | 
155 |     pipe9 -> lsu
156 |     pipe10 -> lsu
157 |     pipe11 -> lsu
158 |     pipe12 -> lsu
159 | 
160 |     rob -> dispatch1
161 |     rob -> dispatch2
162 |     rob -> dispatch3
163 | 
164 |     # Source: Geekerwan
165 |     dispatch4: 12-entry Dispatch Queue \#4
166 | 
167 |     # Source: Geekerwan
168 |     sched6: 41-entry Scheduler \#6
169 | 
170 |     # Source: Geekerwan
171 |     pipe13: Pipe \#13 {
172 |       FP
173 |       SIMD
174 |     }
175 |     dispatch4 -> sched6 -> rf.vrf -> pipe13
176 | 
177 |     # Source: Geekerwan
178 |     sched7: 41-entry Scheduler \#7
179 | 
180 |     # Source: Geekerwan
181 |     pipe14: Pipe \#14 {
182 |       FP
183 |       SIMD
184 |     }
185 |     dispatch4 -> sched7 -> rf.vrf -> pipe14
186 | 
187 |     # Source: Geekerwan
188 |     sched8: 41-entry Scheduler \#8
189 | 
190 |     # Source: Geekerwan
191 |     pipe15: Pipe \#15 {
192 |       FP
193 |       SIMD
194 |       TO INT
195 |     }
196 |     dispatch4 -> sched8 -> rf.vrf -> pipe15
197 | 
198 |     # Source: Geekerwan
199 |     sched9: 41-entry Scheduler \#9
200 | 
201 |     # Source: Geekerwan
202 |     pipe16: Pipe \#16 {
203 |       FP
204 |       SIMD
205 |       FSQRT
206 |       FCSEL
207 |       TO INT
208 |     }
209 |     dispatch4 -> sched9 -> rf.vrf -> pipe16
210 |     rob -> dispatch4
211 |   }
212 |   frontend.rename -> backend.rob
213 | 
214 |   mem: Memory {
215 |     l1: L1 DC {
216 |       # Source: Geekerwan
217 |       l1dc: 128KB L1DC
218 |     }
219 |   }
220 |   backend.lsu -> mem.l1
221 | 
222 |   info: |md
223 |     Drawn by Jiajie Chen @jiegec
224 | 
225 |     Based on data from Geekerwan
226 |   |
227 | }


--------------------------------------------------------------------------------
/docs/m3_pcore.md:
--------------------------------------------------------------------------------
1 | # Apple M3 P-core
2 | 
3 | ![](./m3_pcore.svg)
4 | 
5 | References:
6 | 
7 | - [苹果 M4 性能分析：尽力了，但芯片工艺快到头了！](https://www.bilibili.com/video/BV1NJ4m1w7zk/)
8 | - [MacBook Air M3 简评：性能、续航如何？M1 用户要换吗？](https://www.bilibili.com/video/BV1cw4m1o76r/)
9 | 


--------------------------------------------------------------------------------
/docs/m4_pcore.d2:
--------------------------------------------------------------------------------
  1 | cpu : Apple M4 P-core CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     # Coupled Frontend: the branch predictor feeds the L1 IC directly (no fetch queue in between)
  7 |     l1ic: L1 IC {
  8 |       # Source: Geekerwan
  9 |       l1ic: 192KB L1 IC
 10 |     }
 11 | 
 12 |     bp -> l1ic
 13 | 
 14 |     # Source: Geekerwan
 15 |     decode: 10-way Decode
 16 |     l1ic -> decode
 17 |     decode -> bp
 18 | 
 19 |     # Source: Geekerwan
 20 |     rename: 10-way Rename
 21 |     decode -> rename
 22 |   }
 23 | 
 24 |   backend: Backend {
 25 |     # Source: Geekerwan
 26 |     rob: 361-entry Coalesced ROB
 27 | 
 28 |     rf: Register File {
 29 |       # Source: Geekerwan
 30 |       irf: ~446-entry Integer Register File
 31 | 
 32 |       # Source: Geekerwan
 33 |       vrf: ~378-entry 128b Vector Register File
 34 |     }
 35 | 
 36 |     # Source: Geekerwan
 37 |     dispatch1: 15-entry Dispatch Queue \#1
 38 | 
 39 |     # Source: Geekerwan
 40 |     sched1: 87-entry Scheduler \#1
 41 | 
 42 |     # Source: Geekerwan
 43 |     pipe1: Pipe \#1 {
 44 |       ALU
 45 |       BR
 46 |       FLAGS
 47 |       ADR
 48 |     }
 49 |     dispatch1 -> sched1 -> rf.irf -> pipe1
 50 | 
 51 |     # Source: Geekerwan
 52 |     pipe2: Pipe \#2 {
 53 |       ALU
 54 |       BR
 55 |       FLAGS
 56 |       ADR
 57 |     }
 58 |     dispatch1 -> sched1 -> rf.irf -> pipe2
 59 | 
 60 |     # Source: Geekerwan
 61 |     pipe3: Pipe \#3 {
 62 |       ALU
 63 |       FLAGS
 64 |       ADR
 65 |     }
 66 |     dispatch1 -> sched1 -> rf.irf -> pipe3
 67 | 
 68 |     # Source: Geekerwan
 69 |     pipe4: Pipe \#4 {
 70 |       ALU
 71 |       FLAGS
 72 |       ADR
 73 |     }
 74 |     dispatch1 -> sched1 -> rf.irf -> pipe4
 75 | 
 76 |     # Source: Geekerwan
 77 |     dispatch2: 15-entry Dispatch Queue \#2
 78 | 
 79 |     # Source: Geekerwan
 80 |     sched2: 19-entry Scheduler \#2
 81 | 
 82 |     # Source: Geekerwan
 83 |     pipe5: Pipe \#5 {
 84 |       ALU
 85 |       ADR
 86 |     }
 87 |     dispatch2 -> sched2 -> rf.irf -> pipe5
 88 | 
 89 |     # Source: Geekerwan
 90 |     sched3: 29-entry Scheduler \#3
 91 | 
 92 |     # Source: Geekerwan
 93 |     pipe6: Pipe \#6 {
 94 |       ALU
 95 |     }
 96 |     dispatch2 -> sched3 -> rf.irf -> pipe6
 97 | 
 98 |     # Source: Geekerwan
 99 |     sched4: 17-entry Scheduler \#4
100 | 
101 |     # Source: Geekerwan
102 |     pipe7: Pipe \#7 {
103 |       ALU
104 |       MUL
105 |       DIV
106 |     }
107 |     dispatch2 -> sched4 -> rf.irf -> pipe7
108 | 
109 |     # Source: Geekerwan
110 |     sched5: 23-entry Scheduler \#5
111 | 
112 |     # Source: Geekerwan
113 |     pipe8: Pipe \#8 {
114 |       ALU
115 |       MUL
116 |       BFM
117 |       MADD
118 |     }
119 |     dispatch2 -> sched5 -> rf.irf -> pipe8
120 | 
121 |     # Source: Geekerwan
122 |     dispatch3: 15-entry Dispatch Queue \#3
123 | 
124 |     # Source: Geekerwan
125 |     sched6: 72-entry Scheduler \#6
126 | 
127 |     # Source: Geekerwan
128 |     pipe9: Pipe \#9 {
129 |       STORE
130 |     }
131 |     dispatch3 -> sched6 -> rf.irf -> pipe9
132 | 
133 |     # Source: Geekerwan
134 |     pipe10: Pipe \#10 {
135 |       LOAD
136 |       STORE
137 |     }
138 |     dispatch3 -> sched6 -> rf.irf -> pipe10
139 | 
140 |     # Source: Geekerwan
141 |     pipe11: Pipe \#11 {
142 |       LOAD
143 |     }
144 |     dispatch3 -> sched6 -> rf.irf -> pipe11
145 | 
146 |     # Source: Geekerwan
147 |     pipe12: Pipe \#12 {
148 |       LOAD
149 |     }
150 |     dispatch3 -> sched6 -> rf.irf -> pipe12
151 | 
152 |     lsu: LSU {
153 |       # Source: Geekerwan
154 |       131-entry Load Queue
155 |       72-entry Store Queue
156 |     }
157 | 
158 |     pipe9 -> lsu
159 |     pipe10 -> lsu
160 |     pipe11 -> lsu
161 |     pipe12 -> lsu
162 | 
163 |     rob -> dispatch1
164 |     rob -> dispatch2
165 |     rob -> dispatch3
166 | 
167 |     # Source: Geekerwan
168 |     dispatch4: 23-entry Dispatch Queue \#4
169 | 
170 |     # Source: Geekerwan
171 |     sched7: 61-entry Scheduler \#7
172 | 
173 |     # Source: Geekerwan
174 |     pipe13: Pipe \#13 {
175 |       FP
176 |       SIMD
177 |     }
178 |     dispatch4 -> sched7 -> rf.vrf -> pipe13
179 | 
180 |     # Source: Geekerwan
181 |     sched8: 61-entry Scheduler \#8
182 | 
183 |     # Source: Geekerwan
184 |     pipe14: Pipe \#14 {
185 |       FP
186 |       SIMD
187 |     }
188 |     dispatch4 -> sched8 -> rf.vrf -> pipe14
189 | 
190 |     # Source: Geekerwan
191 |     sched9: 61-entry Scheduler \#9
192 | 
193 |     # Source: Geekerwan
194 |     pipe15: Pipe \#15 {
195 |       FP
196 |       SIMD
197 |       FCSEL
198 |       TO INT
199 |     }
200 |     dispatch4 -> sched9 -> rf.vrf -> pipe15
201 | 
202 |     # Source: Geekerwan
203 |     sched10: 61-entry Scheduler \#10
204 | 
205 |     # Source: Geekerwan
206 |     pipe16: Pipe \#16 {
207 |       FP
208 |       SIMD
209 |       DIV
210 |       FSQRT
211 |       FCSEL
212 |       TO INT
213 |     }
214 |     dispatch4 -> sched10 -> rf.vrf -> pipe16
215 |     rob -> dispatch4
216 |   }
217 |   frontend.rename -> backend.rob
218 | 
219 |   mem: Memory {
220 |     l1: L1 DC {
221 |       # Source: Geekerwan
222 |       l1dc: 128KB L1DC
223 |     }
224 |   }
225 |   backend.lsu -> mem.l1
226 | 
227 |   info: |md
228 |     Drawn by Jiajie Chen @jiegec
229 | 
230 |     Based on data from Geekerwan
231 |   |
232 | }


--------------------------------------------------------------------------------
/docs/m4_pcore.md:
--------------------------------------------------------------------------------
1 | # Apple M4 P-core
2 | 
3 | ![](./m4_pcore.svg)
4 | 
5 | References:
6 | 
7 | - [苹果 M4 性能分析：尽力了，但芯片工艺快到头了！](https://www.bilibili.com/video/BV1NJ4m1w7zk/)
8 | - [iPhone 16系列性能分析：A18兄弟挺强的！](https://www.bilibili.com/video/BV178tEeVEMD/)
9 | 


--------------------------------------------------------------------------------
/docs/main.py:
--------------------------------------------------------------------------------
  1 | import pandas
  2 | import math
  3 | 
  4 | all_data = pandas.read_csv("docs/uarch.csv")
  5 | 
  6 | 
  7 | def define_env(env):
  8 |     @env.macro
  9 |     def bp_comparison():
 10 |         data = all_data
 11 |         # filter columns
 12 |         data = data[
 13 |             [
 14 |                 "uArch",
 15 |                 "L1 BTB",
 16 |                 "L2 BTB",
 17 |                 "L3 BTB",
 18 |                 "ITA",
 19 |                 "RAS",
 20 |             ]
 21 |         ]
 22 |         # drop integer index
 23 |         data = data.set_index("uArch")
 24 |         # handle empty fields
 25 |         data = data.fillna("")
 26 |         # convert to integer
 27 |         fields = ["ITA"]
 28 |         for index, row in data.iterrows():
 29 |             for field in fields:
 30 |                 if row[field] != "":
 31 |                     data.loc[index, field] = str(int(row[field]))
 32 |         return data.to_markdown()
 33 | 
 34 |     @env.macro
 35 |     def l1ic_comparison():
 36 |         data = all_data
 37 |         # filter columns
 38 |         data = data[
 39 |             [
 40 |                 "uArch",
 41 |                 "L1 IC",
 42 |                 "L1 ITLB",
 43 |                 "L2 ITLB",
 44 |             ]
 45 |         ]
 46 |         # drop integer index
 47 |         data = data.set_index("uArch")
 48 |         # handle empty fields
 49 |         data = data.fillna("")
 50 |         # convert to integer
 51 |         fields = ["L1 ITLB", "L2 ITLB"]
 52 |         for index, row in data.iterrows():
 53 |             for field in fields:
 54 |                 if row[field] != "":
 55 |                     data.loc[index, field] = str(int(row[field]))
 56 |         return data.to_markdown()
 57 | 
 58 |     @env.macro
 59 |     def rob_comparison():
 60 |         data = all_data
 61 |         # filter columns
 62 |         data = data[
 63 |             [
 64 |                 "uArch",
 65 |                 "ROB",
 66 |             ]
 67 |         ]
 68 |         # drop integer index
 69 |         data = data.set_index("uArch")
 70 |         return data.to_markdown()
 71 | 
 72 |     @env.macro
 73 |     def eu_comparison():
 74 |         data = all_data
 75 |         # filter columns
 76 |         data = data[
 77 |             [
 78 |                 "uArch",
 79 |                 "ALU units",
 80 |                 "Branch units",
 81 |                 "FP/Vec units",
 82 |             ]
 83 |         ]
 84 |         # drop integer index
 85 |         data = data.set_index("uArch")
 86 |         return data.to_markdown()
 87 | 
 88 |     @env.macro
 89 |     def cortex_x_comparison():
 90 |         data = all_data
 91 |         # only consider cortex x cores
 92 |         data = data[data["uArch"].str.startswith("ARM Cortex-X")]
 93 |         # filter columns
 94 |         data = data[
 95 |             [
 96 |                 "uArch",
 97 |                 "ALU units",
 98 |                 "Branch units",
 99 |                 "Load/Store pipes",
100 |                 "Load-only pipes",
101 |                 "Store-only pipes",
102 |                 "ROB",
103 |                 "Decode width",
104 |                 "Rename width",
105 |             ]
106 |         ]
107 |         data["Max Load"] = data["Load/Store pipes"].astype(int) + data["Load-only pipes"].astype(int)
108 |         data["Max Store"] = data["Load/Store pipes"].astype(int) + data["Store-only pipes"].astype(int)
109 |         data["Max Load+Store"] = (
110 |             data["Load/Store pipes"].astype(int)
111 |             + data["Load-only pipes"].astype(int)
112 |             + data["Store-only pipes"].astype(int)
113 |         )
114 | 
115 |         # reduce column width
116 |         for index, row in data.iterrows():
117 |             data.loc[index, "uArch"] = row["uArch"].removeprefix("ARM ")
118 |         # drop integer index
119 |         data = data.set_index("uArch")
120 |         data = data.transpose()
121 |         # compute maximum load
122 |         return data.to_markdown()
123 | 
124 |     @env.macro
125 |     def firestorm_oryon_comparison():
126 |         data = all_data
127 |         # only consider firestorm/oryon
128 |         data = data[data["uArch"].isin(["Apple Firestorm", "Qualcomm Oryon"])]
129 |         # filter columns
130 |         data = data[
131 |             [
132 |                 "uArch",
133 |                 "L1 BTB",
134 |                 "L2 BTB",
135 |                 "RAS",
136 |                 "L1 IC",
137 |                 "Decode width",
138 |                 "Rename width",
139 |                 "ROB",
140 |                 "Branch units",
141 |                 "ALU units",
142 |                 "FP/Vec units",
143 |                 "Load/Store pipes",
144 |                 "Load-only pipes",
145 |                 "Store-only pipes",
146 |             ]
147 |         ]
148 |         # drop integer index
149 |         data = data.set_index("uArch")
150 |         fields = ["L1 BTB", "Rename width"]
151 |         for index, row in data.iterrows():
152 |             for field in fields:
153 |                 if row[field] != "":
154 |                     data.loc[index, field] = str(row[field])
155 |         data = data.transpose()
156 |         return data.to_markdown()
157 | 


--------------------------------------------------------------------------------
/docs/neoverse_n2.md:
--------------------------------------------------------------------------------
1 | # ARM Neoverse N2
2 | 
3 | ![](./neoverse_n2.svg)
4 | 
5 | References:
6 | 
7 | - [Arm Neoverse N2: Arm’s 2nd generation high performance infrastructure CPUs and system IPs](https://hc33.hotchips.org/assets/program/conference/day1/20210818_Hotchips_NeoverseN2.pdf)
8 | 


--------------------------------------------------------------------------------
/docs/neoverse_v2.d2:
--------------------------------------------------------------------------------
  1 | cpu : ARM Neoverse-V2 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: ARM
  5 |       two predicted branches per cycle
  6 |       # "Compared to Neoverse V1: 10x larger nano BTB, split main BTB into two
  7 |       # levels with 50% more entries"
  8 |       two-level branch target buffer
  9 |       # "Compared to Neoverse V1: 2x larger table with 2-way associativity and
 10 |       # longer history"
 11 |       2-way 8 table TAGE direction predictor with staged output
 12 |     }
 13 | 
 14 |     l1ic: L1 IC {
 15 |       # Source: ARM
 16 |       # "64kB, 4-way set-associative L1 instruction cache"
 17 |       l1ic: 64KB 4-way L1 IC
 18 |     }
 19 | 
 20 |     # Source: ARM
 21 |     # "Doubled from 16 to 32 entries"
 22 |     fq: 32-entry Fetch Queue
 23 |     bp -> fq
 24 |     fq -> l1ic
 25 | 
 26 |     # Source: ARM
 27 |     # "Increased Decode Queue from 16 to 24 entries"
 28 |     iq: 24-entry Instruction Queue
 29 |     l1ic -> iq
 30 | 
 31 |     # Source: ARM
 32 |     # "Increased decoder lanes from 5 to 6"
 33 |     decode: 6-way Decode
 34 |     iq -> decode
 35 | 
 36 |     # Source: ARM
 37 |     mopc: MOP Cache
 38 |     decode -> mopc
 39 |     bp -> mopc
 40 | 
 41 |     mop: MOP Queue
 42 | 
 43 |     # Source: ARM
 44 |     mopc -> mop: 8 MOP/cycle
 45 |     decode -> mop: 6 MOP/cycle
 46 | 
 47 |     # Source: ARM
 48 |     rename: 8-way Rename {
 49 |       Zero Idiom
 50 |     }
 51 |     mop -> rename
 52 |   }
 53 | 
 54 |   backend: Backend {
 55 |     rob: ROB
 56 | 
 57 |     rf: Register File {
 58 |       irf: Integer Register File
 59 | 
 60 |       flagsrf: Flags Register File
 61 | 
 62 |       vrf: FP/Vector Register File
 63 |     }
 64 | 
 65 |     # Source: ARM
 66 |     # "SX/MX: Increased from 20 to 22 entries"
 67 |     sched1: 22-entry ALU Scheduler
 68 | 
 69 |     # Source: ARM
 70 |     pipe1: Pipe \#1 {
 71 |       ALU
 72 |     }
 73 |     rob -> sched1 -> rf -> pipe1
 74 | 
 75 |     # Source: ARM
 76 |     pipe2: Pipe \#2 {
 77 |       Branch
 78 |     }
 79 |     rob -> sched1 -> rf -> pipe2
 80 | 
 81 |     # Source: ARM
 82 |     # "SX/MX: Increased from 20 to 22 entries"
 83 |     sched2: 22-entry ALU Scheduler
 84 | 
 85 |     # Source: ARM
 86 |     pipe3: Pipe \#3 {
 87 |       ALU
 88 |     }
 89 |     rob -> sched2 -> rf -> pipe3
 90 | 
 91 |     # Source: ARM
 92 |     pipe4: Pipe \#4 {
 93 |       Branch
 94 |     }
 95 |     rob -> sched2 -> rf -> pipe4
 96 | 
 97 |     # Source: ARM
 98 |     # "SX/MX: Increased from 20 to 22 entries"
 99 |     sched3: 22-entry ALU Scheduler
100 | 
101 |     # Source: ARM
102 |     pipe5: Pipe \#5 {
103 |       ALU
104 |       SHIFT + ALU
105 |       MUL/IMAC/DIV/CRC/SPR
106 |     }
107 |     rob -> sched3 -> rf -> pipe5
108 | 
109 |     # Source: ARM
110 |     # "SX/MX: Increased from 20 to 22 entries"
111 |     sched4: 22-entry ALU Scheduler
112 | 
113 |     # Source: ARM
114 |     pipe6: Pipe \#6 {
115 |       ALU
116 |       SHIFT + ALU
117 |       MUL/IMAC
118 |     }
119 |     rob -> sched4 -> rf -> pipe6
120 | 
121 |     # Source: ARM
122 |     # "SX/MX: Increased from 20 to 22 entries"
123 |     sched5: 22-entry ALU Scheduler
124 | 
125 |     # Source: ARM
126 |     pipe7: Pipe \#7 {
127 |       ALU
128 |     }
129 |     rob -> sched5 -> rf -> pipe7
130 | 
131 |     # Source: ARM
132 |     # "SX/MX: Increased from 20 to 22 entries"
133 |     sched6: 22-entry ALU Scheduler
134 | 
135 |     # Source: ARM
136 |     pipe8: Pipe \#8 {
137 |       ALU
138 |     }
139 |     rob -> sched6 -> rf -> pipe8
140 | 
141 |     # Source: ARM
142 |     # "VX: Increased from 20 to 28 entries"
143 |     sched7: 28-entry FP/Vector Scheduler
144 | 
145 |     # Source: ARM
146 |     pipe9: Pipe \#9 {
147 |       FP/Vector
148 |     }
149 |     rob -> sched7 -> rf -> pipe9
150 | 
151 |     # Source: ARM
152 |     pipe10: Pipe \#10 {
153 |       FP/Vector
154 |     }
155 |     rob -> sched7 -> rf -> pipe10
156 | 
157 |     # Source: ARM
158 |     # "VX: Increased from 20 to 28 entries"
159 |     sched8: 28-entry FP/Vector Scheduler
160 | 
161 |     # Source: ARM
162 |     pipe11: Pipe \#11 {
163 |       FP/Vector
164 |     }
165 |     rob -> sched8 -> rf -> pipe11
166 | 
167 |     # Source: ARM
168 |     pipe12: Pipe \#12 {
169 |       FP/Vector
170 |     }
171 |     rob -> sched8 -> rf -> pipe12
172 | 
173 |     # Source: ARM
174 |     sched9: Memory Scheduler
175 | 
176 |     # Source: ARM
177 |     pipe13: Pipe \#13 {
178 |       Load/Store
179 |     }
180 |     rob -> sched9 -> rf -> pipe13
181 | 
182 |     # Source: ARM
183 |     sched10: Memory Scheduler
184 | 
185 |     # Source: ARM
186 |     pipe14: Pipe \#14 {
187 |       Load/Store
188 |     }
189 |     rob -> sched10 -> rf -> pipe14
190 | 
191 |     # Source: ARM
192 |     sched11: Memory Scheduler
193 | 
194 |     # Source: ARM
195 |     pipe15: Pipe \#15 {
196 |       Load
197 |     }
198 |     rob -> sched11 -> rf -> pipe15
199 | 
200 |     lsu: LSU {
201 |       Load Queue
202 |       Store Queue
203 |     }
204 | 
205 |     pipe13 -> lsu
206 |     pipe14 -> lsu
207 |     pipe15 -> lsu
208 |   }
209 |   frontend.rename -> backend.rob
210 | 
211 |   mem: Memory {
212 |     l1: L1 DC {
213 |       # Source: ARM
214 |       # "64kB 4-way set associative Dcache"
215 |       l1dc: 64KB 4-way L1 DC
216 |     }
217 | 
218 |     l2: L2 {
219 |     }
220 | 
221 |     l1 -> l2
222 | 
223 |     l3: L3 {
224 |     }
225 |     l2 -> l3
226 |   }
227 |   frontend.l1ic -> mem.l2
228 |   backend.lsu -> mem.l1
229 | 
230 |   info: |md
231 |     Drawn by Jiajie Chen @jiegec
232 | 
233 |     Based on data from ARM and Chips and Cheese
234 |   |
235 | }
236 | 


--------------------------------------------------------------------------------
/docs/neoverse_v2.md:
--------------------------------------------------------------------------------
1 | # ARM Neoverse V2
2 | 
3 | ![](./neoverse_v2.svg)
4 | 
5 | References:
6 | 
7 | - [Arm Neoverse V2 platform: Leadership Performance and Power Efficiency for Next-Generation Cloud Computing, ML and HPC Workloads](https://hc2023.hotchips.org/assets/program/conference/day1/CPU1/HC2023.Arm.MagnusBruce.v04.FINAL.pdf)
8 | - [Hot Chips 2023: Arm’s Neoverse V2](https://chipsandcheese.com/p/hot-chips-2023-arms-neoverse-v2)
9 | 


--------------------------------------------------------------------------------
/docs/oryon.d2:
--------------------------------------------------------------------------------
  1 | cpu : Qualcomm Oryon CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       # Source: Qualcomm, 2Ke+
  6 |       l1btb: 2048-entry L1 BTB, 1 cycle latency
  7 | 
  8 |       # Source: Chips and Cheese
  9 |       l2btb: 192KB 6-way L1 IC as L2 BTB, 3 cycle latency
 10 | 
 11 |       # Source: Chips and Cheese
 12 |       ind: 2048-entry Indirect Target Buffer
 13 | 
 14 |       # Source: Qualcomm
 15 |       cond: 80KB Conditional Predictor
 16 | 
 17 |       # Source: Qualcomm
 18 |       indir: 40KB Indirect Predictor
 19 | 
 20 |       # Source: Chips and Cheese
 21 |       # Source: jiegec, 50
 22 |       # Source: Qualcomm, 50e
 23 |       ras: 50-entry RAS
 24 |     }
 25 | 
 26 |     # Decoupled Frontend: a fetch queue sits between the branch predictor and the L1 IC
 27 |     fq: Fetch Queue
 28 |     bp -> fq
 29 | 
 30 |     l1ic: L1 IC {
 31 |       # Source: Chips and Cheese
 32 |       # Source: Qualcomm
 33 |       itlb: 256-entry 8-way ITLB, 1 cycle latency
 34 | 
 35 |       # Source: Chips and Cheese, Anandtech
 36 |       # Source: Qualcomm
 37 |       l1ic: 192KB 6-way L1 IC
 38 |     }
 39 | 
 40 |     fq -> l1ic
 41 | 
 42 |     iq: Instruction Queue
 43 |     # Source: Qualcomm
 44 |     # "Fetches up to 16 instructions per cycle"
 45 |     l1ic -> iq: 16 inst/cycle
 46 | 
 47 |     # Source: Chips and Cheese, Anandtech
 48 |     decode: 8-way Decode
 49 |     iq -> decode
 50 | 
 51 |     # Source: Chips and Cheese
 52 |     uop: UOP Queue
 53 |     decode -> uop
 54 | 
 55 |     # Source: Chips and Cheese
 56 |     rename: 8-way Rename
 57 |     uop -> rename
 58 |   }
 59 | 
 60 |   backend: Backend {
 61 |     # Source: Chips and Cheese
 62 |     # Source: Qualcomm, 650+
 63 |     rob: 680-entry ROB, retire 8 op/cycle
 64 | 
 65 |     # Source: jiegec
 66 |     bob: 119-entry BOB
 67 | 
 68 |     rf: Register File {
 69 |       # Source: Chips and Cheese
 70 |       # Source: Qualcomm, 400+
 71 |       irf: 384 + 32-entry Integer Register File
 72 | 
 73 |       # Source: Chips and Cheese
 74 |       # Source: Qualcomm, 400+
 75 |       vrf: 384 + 32-entry 128b Vector Register File
 76 | 
 77 |     }
 78 | 
 79 |     # Source: Chips and Cheese, Anandtech
 80 |     # Source: Qualcomm, each with a 20e queue
 81 |     sched1: 20-entry ALU Scheduler \#1
 82 | 
 83 |     # Source: Chips and Cheese
 84 |     pipe1: Pipe \#1 {
 85 |       ALU
 86 |       Direct Branch
 87 |       Indirect Branch
 88 |     }
 89 |     sched1 -> rf.irf -> pipe1
 90 | 
 91 |     # Source: Chips and Cheese
 92 |     sched2: 20-entry ALU Scheduler \#2
 93 | 
 94 |     # Source: Chips and Cheese
 95 |     pipe2: Pipe \#2 {
 96 |       ALU
 97 |       Direct Branch
 98 |     }
 99 |     sched2 -> rf.irf -> pipe2
100 | 
101 |     # Source: Chips and Cheese
102 |     sched3: 20-entry ALU Scheduler \#3
103 | 
104 |     # Source: Chips and Cheese
105 |     pipe3: Pipe \#3 {
106 |       ALU
107 |       Crypto
108 |       CRC
109 |     }
110 |     sched3 -> rf.irf -> pipe3
111 | 
112 |     # Source: Chips and Cheese
113 |     sched4: 20-entry ALU Scheduler \#4
114 | 
115 |     # Source: Chips and Cheese
116 |     pipe4: Pipe \#4 {
117 |       ALU
118 |     }
119 |     sched4 -> rf.irf -> pipe4
120 | 
121 |     # Source: Chips and Cheese
122 |     sched5: 20-entry ALU Scheduler \#5
123 | 
124 |     # Source: Chips and Cheese
125 |     pipe5: Pipe \#5 {
126 |       ALU
127 |       INT MUL
128 |       I2V
129 |     }
130 |     sched5 -> rf.irf -> pipe5
131 | 
132 |     # Source: Chips and Cheese
133 |     sched6: 20-entry ALU Scheduler \#6
134 | 
135 |     # Source: Chips and Cheese
136 |     pipe6: Pipe \#6 {
137 |       ALU
138 |       INT MUL
139 |       INT DIV
140 |       I2V
141 |     }
142 |     sched6 -> rf.irf -> pipe6
143 | 
144 |     # Source: Chips and Cheese, Anandtech
145 |     # Source: Qualcomm, each with a 16e queue
146 |     sched7: 16-entry Memory Scheduler \#7
147 | 
148 |     # Source: Chips and Cheese
149 |     pipe7: Pipe \#7 {
150 |       AGU
151 |       Load
152 |       Store
153 |     }
154 |     sched7 -> rf.irf -> pipe7
155 | 
156 |     # Source: Chips and Cheese
157 |     sched8: 16-entry Memory Scheduler \#8
158 | 
159 |     # Source: Chips and Cheese
160 |     pipe8: Pipe \#8 {
161 |       AGU
162 |       Load
163 |       Store
164 |     }
165 |     sched8 -> rf.irf -> pipe8
166 | 
167 |     # Source: Chips and Cheese
168 |     sched9: 16-entry Memory Scheduler \#9
169 | 
170 |     # Source: Chips and Cheese
171 |     pipe9: Pipe \#9 {
172 |       AGU
173 |       Load
174 |       Store
175 |     }
176 |     sched9 -> rf.irf -> pipe9
177 | 
178 |     # Source: Chips and Cheese
179 |     sched10: 16-entry Memory Scheduler \#10
180 | 
181 |     # Source: Chips and Cheese
182 |     pipe10: Pipe \#10 {
183 |       AGU
184 |       Load
185 |       Store
186 |     }
187 |     sched10 -> rf.irf -> pipe10
188 | 
189 |     lsu: LSU {
190 |       # Source: Chips and Cheese, Anandtech, Qualcomm
191 |       192-entry Load Queue
192 |       56-entry Store Queue
193 |       # Source: Chips and Cheese
194 |       2 Load Pipe
195 |       2 Load/Store Pipe
196 |       # Source: jiegec
197 |       4 cycle load to use latency
198 |       3 cycle load to use latency in pointer chasing
199 |     }
200 | 
201 |     pipe7 -> lsu
202 |     pipe8 -> lsu
203 |     pipe9 -> lsu
204 |     pipe10 -> lsu
205 | 
206 |     rob -> sched1
207 |     rob -> sched2
208 |     rob -> sched3
209 |     rob -> sched4
210 |     rob -> sched5
211 |     rob -> sched6
212 |     rob -> sched7
213 |     rob -> sched8
214 |     rob -> sched9
215 |     rob -> sched10
216 | 
217 |     # Source: Chips and Cheese, Anandtech
218 |     # Source: Qualcomm, each with a 48e queue
219 |     sched11: 48-entry FP Scheduler \#11
220 | 
221 |     # Source: Chips and Cheese
222 |     pipe11: Pipe \#11 {
223 |       128b ALU
224 |       NEON
225 |       V2I
226 |     }
227 |     sched11 -> rf.vrf -> pipe11
228 | 
229 |     # Source: Chips and Cheese
230 |     sched12: 48-entry FP Scheduler \#12
231 | 
232 |     # Source: Chips and Cheese
233 |     pipe12: Pipe \#12 {
234 |       128b ALU
235 |       NEON
236 |       V2I
237 |     }
238 |     sched12 -> rf.vrf -> pipe12
239 | 
240 |     # Source: Chips and Cheese
241 |     sched13: 48-entry FP Scheduler \#13
242 | 
243 |     # Source: Chips and Cheese
244 |     pipe13: Pipe \#13 {
245 |       128b ALU
246 |       NEON
247 |     }
248 |     sched13 -> rf.vrf -> pipe13
249 | 
250 |     # Source: Chips and Cheese
251 |     sched14: 48-entry FP Scheduler \#14
252 | 
253 |     # Source: Chips and Cheese
254 |     pipe14: Pipe \#14 {
255 |       128b ALU
256 |       NEON
257 |       FDIV
258 |       FSQRT
259 |     }
260 |     sched14 -> rf.vrf -> pipe14
261 |     rob -> sched11
262 |     rob -> sched12
263 |     rob -> sched13
264 |     rob -> sched14
265 |   }
266 |   frontend.rename -> backend.rob
267 |   frontend.rename -> backend.bob
268 | 
269 |   mem: Memory {
270 |     l1: L1 DC {
271 |       # Source: Chips and Cheese, Anandtech, Qualcomm
272 |       l1dtlb: 224-entry 7-way L1 DTLB, 1 cycle latency
273 | 
274 |       # Source: Chips and Cheese, Anandtech
275 |       l1dc: 96KB 6-way L1DC, 56-entry MSHR
276 |     }
277 | 
278 |     l2: L2 {
279 |       # Source: Chips and Cheese
280 |       l2tlb: 8192-entry 8-way L2 TLB
281 | 
282 |       # Source: Chips and Cheese, Anandtech
283 |       l2dc: 12MB 12-way L2 Cache per 4-Core Cluster
284 |     }
285 |     l1 -> l2: 64B/cycle
286 | 
287 |     # Source: Chips and Cheese, Anandtech
288 |     slc: 6MB System Level Cache
289 |     l2 -> slc: 32B/cycle
290 | 
291 |     # Source: Anandtech
292 |     dram: LPDDR5x 8448 MT/s, 8x 16b
293 |     slc -> dram
294 |   }
295 |   frontend.l1ic -> mem.l2
296 |   backend.lsu -> mem.l1
297 | 
298 |   info: |md
299 |     Drawn by Jiajie Chen @jiegec
300 | 
301 |     Based on data from Chips and Cheese, Anandtech and Qualcomm
302 |   |
303 | }


--------------------------------------------------------------------------------
/docs/oryon.md:
--------------------------------------------------------------------------------
 1 | # Qualcomm Oryon
 2 | 
 3 | ![](./oryon.svg)
 4 | 
 5 | Oryon-M in Qualcomm Snapdragon 8 Elite, compared with the core shown above:
 6 | 
 7 | 1. 4-wide decode instead of 8
 8 | 2. 4 integer pipelines instead of 6
 9 | 3. 2 load store pipelines instead of 4
10 | 4. 2 fp/simd pipelines instead of 4
11 | 
12 | References:
13 | 
14 | - [Qualcomm’s Oryon Core: A Long Time in the Making - Chips and Cheese](https://chipsandcheese.com/2024/07/09/qualcomms-oryon-core-a-long-time-in-the-making/)
15 | - [The Qualcomm Snapdragon X Architecture Deep Dive: Getting To Know Oryon and Adreno X1 - Anandtech](https://www.anandtech.com/show/21445/qualcomm-snapdragon-x-architecture-deep-dive)
16 | - Snapdragon X Series - Architecture Overview
17 | - [高通 X Elite Oryon 微架构评测：走走停停](https://zhuanlan.zhihu.com/p/704707254)
18 | - [Snapdragon X Elite](https://www.qualcomm.com/products/mobile/snapdragon/laptops-and-tablets/snapdragon-x-elite)
19 | - [Qualcomm Oryon CPU](https://www.qualcomm.com/products/technology/processors/oryon)
20 | - [Qualcomm’s Oryon LLVM Patches](https://chipsandcheese.com/2024/05/15/qualcomms-oryon-llvm-patches/)
21 | - [高通自研 PC 芯片 X Elite 实测：真能干翻苹果英特尔？](https://www.bilibili.com/video/BV1Ue41197Qb/)
22 | - [太贵了，它没你想的那么美好！高通骁龙 X Elite 78-100 笔记本详细评测](https://www.bilibili.com/video/BV1z1421r7dZ/)
23 | - [AArch64SchedOryon.td in LLVM](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64SchedOryon.td)
24 | - [Hot Chips 2024: Qualcomm’s Oryon Core](https://chipsandcheese.com/2024/08/26/hot-chips-2024-qualcomms-oryon-core/)
25 | - [骁龙 8 Elite 首发评测：一加13能效有多好？](https://www.bilibili.com/video/BV1xvysYwEcX/)
26 | - [高通 X Elite 深度分析：年度最自信 CPU](https://b23.tv/iL38AXz)
27 | 


--------------------------------------------------------------------------------
/docs/p550.d2:
--------------------------------------------------------------------------------
  1 | cpu : SiFive P550 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       ras: 16-entry RAS
  6 | 
  7 |       # Source: Chips and Cheese
  8 |       l1btb: 32-entry L1 BTB
  9 |     }
 10 | 
 11 |     l1ic: L1 IC {
 12 |       # Source: Chips and Cheese
 13 |       l1ic: 32KB 4-way L1 IC
 14 | 
 15 |       # Source: Chips and Cheese
 16 |       l1itlb: L1 ITLB
 17 |     }
 18 | 
 19 |     bp -> l1ic
 20 | 
 21 |     # Source: Chips and Cheese
 22 |     decode: 3-way Decode
 23 |     l1ic -> decode: 12 bytes/cycle
 24 |     decode -> bp
 25 | 
 26 |     # Source: Chips and Cheese
 27 |     rename: 3-way Rename
 28 |     decode -> rename
 29 |   }
 30 | 
 31 |   backend: Backend {
 32 |     # Source: Chips and Cheese
 33 |     rob: 96-entry ROB
 34 | 
 35 |     rf: Register File {
 36 |       # Source: Chips and Cheese
 37 |       irf: 128-entry Integer Register File
 38 | 
 39 |       # Source: Chips and Cheese
 40 |       frf: 119-entry FP Register File
 41 |     }
 42 | 
 43 |     # Source: Chips and Cheese
 44 |     sched1: Scheduler \#1
 45 | 
 46 |     # Source: Chips and Cheese
 47 |     pipe1: Pipe \#1 {
 48 |       ALU
 49 |       Branch
 50 |     }
 51 |     rob -> sched1 -> rf -> pipe1
 52 | 
 53 |     # Source: Chips and Cheese
 54 |     sched2: Scheduler \#2
 55 | 
 56 |     # Source: Chips and Cheese
 57 |     pipe2: Pipe \#2 {
 58 |       ALU
 59 |     }
 60 |     rob -> sched2 -> rf -> pipe2
 61 | 
 62 |     # Source: Chips and Cheese
 63 |     sched3: Scheduler \#3
 64 | 
 65 |     # Source: Chips and Cheese
 66 |     pipe3: Pipe \#3 {
 67 |       ALU
 68 |       MUL
 69 |     }
 70 |     rob -> sched3 -> rf -> pipe3
 71 | 
 72 |     # Source: Chips and Cheese
 73 |     sched4: Scheduler \#4
 74 | 
 75 |     # Source: Chips and Cheese
 76 |     pipe4: Pipe \#4 {
 77 |       Load AGU
 78 |     }
 79 |     rob -> sched4 -> rf -> pipe4
 80 | 
 81 |     # Source: Chips and Cheese
 82 |     sched5: Scheduler \#5
 83 | 
 84 |     # Source: Chips and Cheese
 85 |     pipe5: Pipe \#5 {
 86 |       Store AGU
 87 |     }
 88 |     rob -> sched5 -> rf -> pipe5
 89 | 
 90 |     # Source: Chips and Cheese
 91 |     sched6: Scheduler \#6
 92 | 
 93 |     # Source: Chips and Cheese
 94 |     pipe6: Pipe \#6 {
 95 |       FMA
 96 |     }
 97 |     rob -> sched6 -> rf -> pipe6
 98 | 
 99 |     # Source: Chips and Cheese
100 |     pipe7: Pipe \#7 {
101 |       FMA
102 |     }
103 |     rob -> sched6 -> rf -> pipe7
104 | 
105 |     lsu: LSU {
106 |       # Source: Chips and Cheese
107 |       20-entry Load Queue
108 |       15-entry Store Queue
109 |     }
110 | 
111 |     pipe4 -> lsu
112 |     pipe5 -> lsu
113 |   }
114 |   frontend.rename -> backend.rob
115 | 
116 |   mem: Memory {
117 |     l1: L1 DC {
118 |       # Source: Chips and Cheese
119 |       l1dc: 32KB 4-way L1DC
120 |       l1dtlb: 32-entry L1 DTLB
121 |       l2tlb: L2 TLB
122 |     }
123 | 
124 |     # Source: Chips and Cheese
125 |     l2: 256KB 8-way L2 DC
126 |     l1 -> l2
127 |   }
128 |   backend.lsu -> mem.l1
129 | 
130 |   info: |md
131 |     Drawn by Jiajie Chen @jiegec
132 | 
133 |     Based on data from Chips and Cheese
134 |   |
135 | }


--------------------------------------------------------------------------------
/docs/p550.md:
--------------------------------------------------------------------------------
1 | # SiFive P550
2 | 
3 | ![](./p550.svg)
4 | 
5 | References:
6 | 
7 | - [Inside SiFive’s P550 Microarchitecture](https://chipsandcheese.com/p/inside-sifives-p550-microarchitecture)
8 | - [A RISC-V Progress Check: Benchmarking P550 and C910](https://chipsandcheese.com/p/a-risc-v-progress-check-benchmarking)
9 | 


--------------------------------------------------------------------------------
/docs/p870.d2:
--------------------------------------------------------------------------------
  1 | cpu : SiFive P870 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: SiFive
  5 |       nlp: 1024-entry Next Line Predictor
  6 | 
  7 |       # Source: SiFive
  8 |       ras: 64-entry RAS
  9 | 
 10 |       # Source: SiFive
 11 |       cbp: 16384-entry TAGE Direction Predictor
 12 | 
 13 |       # Source: SiFive
 14 |       ibp: 2560-entry Indirect Predictor
 15 |     }
 16 | 
 17 |     # Coupled Frontend
 18 |     l1ic: L1 IC {
 19 |       # Source: SiFive
 20 |       l1ic: 64KB L1 IC
 21 | 
 22 |       # Source: SiFive
 23 |       l1itlb: 32-entry L1 ITLB
 24 |     }
 25 | 
 26 |     bp -> l1ic
 27 | 
 28 |     # Source: SiFive
 29 |     decode: 6-way Decode
 30 |     l1ic -> decode: 36 bytes/cycle
 31 |     decode -> bp
 32 | 
 33 |     # Source: SiFive
 34 |     rename: 6-way Rename
 35 |     decode -> rename
 36 |   }
 37 | 
 38 |   backend: Backend {
 39 |     # Source: SiFive
 40 |     rob: 1120-entry ROB
 41 | 
 42 |     rf: Register File {
 43 |       # Source: SiFive
 44 |       irf: 228-entry Integer Register File
 45 | 
 46 |       # Source: SiFive
 47 |       frf: 240-entry FP Register File
 48 | 
 49 |       # Source: SiFive
 50 |       vrf: 128-entry Vector Register File
 51 |     }
 52 | 
 53 |     # Source: SiFive
 54 |     dispatch1: Integer Dispatch Queue \#1
 55 | 
 56 |     # Source: SiFive
 57 |     sched1: 16-entry Scheduler \#1
 58 | 
 59 |     # Source: SiFive
 60 |     pipe1: Pipe \#1 {
 61 |       ALU
 62 |       DIV
 63 |     }
 64 |     dispatch1 -> sched1 -> rf -> pipe1
 65 | 
 66 |     # Source: SiFive
 67 |     sched2: 16-entry Scheduler \#2
 68 | 
 69 |     # Source: SiFive
 70 |     pipe2: Pipe \#2 {
 71 |       ALU
 72 |       MUL
 73 |     }
 74 |     dispatch1 -> sched2 -> rf -> pipe2
 75 | 
 76 |     # Source: SiFive
 77 |     sched3: 16-entry Scheduler \#3
 78 | 
 79 |     # Source: SiFive
 80 |     pipe3: Pipe \#3 {
 81 |       ALU
 82 |       MUL
 83 |     }
 84 |     dispatch1 -> sched3 -> rf -> pipe3
 85 | 
 86 |     # Source: SiFive
 87 |     sched4: 16-entry Scheduler \#4
 88 | 
 89 |     # Source: SiFive
 90 |     pipe4: Pipe \#4 {
 91 |       ALU
 92 |     }
 93 |     dispatch1 -> sched4 -> rf -> pipe4
 94 | 
 95 |     # Source: SiFive
 96 |     sched5: 16-entry Scheduler \#5
 97 | 
 98 |     # Source: SiFive
 99 |     pipe5: Pipe \#5 {
100 |       BR
101 |       ALU
102 |     }
103 |     dispatch1 -> sched5 -> rf -> pipe5
104 | 
105 |     # Source: SiFive
106 |     sched6: 16-entry Scheduler \#6
107 | 
108 |     # Source: SiFive
109 |     pipe6: Pipe \#6 {
110 |       BR
111 |     }
112 |     dispatch1 -> sched6 -> rf -> pipe6
113 | 
114 |     # Source: SiFive
115 |     dispatch2: Memory Dispatch Queue \#2
116 | 
117 |     # Source: SiFive
118 |     sched7: 32-entry Scheduler \#7
119 | 
120 |     # Source: SiFive
121 |     pipe7: Pipe \#7 {
122 |       AGU
123 |       LD
124 |     }
125 |     dispatch2 -> sched7 -> rf -> pipe7
126 | 
127 |     # Source: SiFive
128 |     pipe8: Pipe \#8 {
129 |       AGU
130 |       LDST
131 |     }
132 |     dispatch2 -> sched7 -> rf -> pipe8
133 | 
134 |     # Source: SiFive
135 |     pipe9: Pipe \#9 {
136 |       AGU
137 |       LDST
138 |     }
139 |     dispatch2 -> sched7 -> rf -> pipe9
140 | 
141 |     lsu: LSU {
142 |       # Source: SiFive; pipe counts mirror pipe7 (LD) and pipe8/pipe9 (LDST)
143 |       48-entry Load Queue
144 |       48-entry Store Queue
145 |       1 Load Pipe
146 |       2 Load/Store Pipes
147 |     }
148 | 
149 |     pipe7 -> lsu
150 |     pipe8 -> lsu
151 |     pipe9 -> lsu
152 | 
153 |     # Source: SiFive
154 |     dispatch3: FP Dispatch Queue \#3
155 | 
156 |     # Source: SiFive
157 |     sched8: 24-entry Scheduler \#8
158 | 
159 |     # Source: SiFive (first of the two FP pipes fed by FP Dispatch Queue 3)
160 |     pipe10: Pipe \#10 {
161 |       FADD
162 |       FMUL
163 |       FMAC
164 |     }
165 |     dispatch3 -> sched8 -> rf -> pipe10
166 | 
167 |     # Source: SiFive
168 |     sched9: 24-entry Scheduler \#9
169 | 
170 |     # Source: SiFive
171 |     pipe11: Pipe \#11 {
172 |       FADD
173 |       FMUL
174 |       FMAC
175 |       FDIV
176 |       FSQRT
177 |     }
178 |     dispatch3 -> sched9 -> rf -> pipe11
179 | 
180 |     # Source: SiFive
181 |     seq: Vector Sequencer
182 | 
183 |     # Source: SiFive
184 |     dispatch4: Vector Dispatch Queue \#4
185 | 
186 |     # Source: SiFive
187 |     sched10: 16-entry Scheduler \#10
188 | 
189 |     # Source: SiFive
190 |     pipe12: Pipe \#12 {
191 |       Vec ADD
192 |       Vec MUL
193 |       Vec MAC
194 |       Vec Crypto
195 |       Vec Div
196 |       Vec Permute
197 |     }
198 |     dispatch4 -> sched10 -> rf -> pipe12
199 | 
200 |     # Source: SiFive
201 |     sched11: 16-entry Scheduler \#11
202 | 
203 |     # Source: SiFive
204 |     pipe13: Pipe \#13 {
205 |       Vec ADD
206 |       Vec MUL
207 |       Vec MAC
208 |       Vec Crypto
209 |       Vec Mask
210 |     }
211 |     dispatch4 -> sched11 -> rf -> pipe13
212 | 
213 |     rob -> dispatch1
214 |     rob -> dispatch2
215 |     rob -> dispatch3
216 |     rob -> seq -> dispatch4
217 |   }
218 |   frontend.rename -> backend.rob
219 | 
220 |   mem: Memory {
221 |     l1: L1 DC {
222 |       # Source: SiFive
223 |       l1dc: 64KB L1DC
224 |       l1dtlb: 64-entry L1 DTLB
225 |       l2tlb: 1024-entry L2 TLB
226 |     }
227 |   }
228 |   backend.lsu -> mem.l1
229 | 
230 |   info: |md
231 |     Drawn by Jiajie Chen @jiegec
232 | 
233 |     Based on data from SiFive
234 |   |
235 | }


--------------------------------------------------------------------------------
/docs/p870.md:
--------------------------------------------------------------------------------
1 | # SiFive P870
2 | 
3 | ![](./p870.svg)
4 | 
5 | References:
6 | 
7 | - [P870 High-Performance RISC-V Processor](https://hc2023.hotchips.org/assets/program/conference/day1/CPU2/P870%20for%20Hot%20Chips%20-%20FInal.pdf)
8 | 


--------------------------------------------------------------------------------
/docs/redwood_cove.d2:
--------------------------------------------------------------------------------
  1 | cpu: Intel Meteor Lake P-core Redwood Cove CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       l1btb: 128-entry L1 BTB, 0 bubble
  6 | 
  7 |       # Source: Chips and Cheese
  8 |       # The figure says another 6K-entry BTB level, but it is not shown in
  9 |       # testing result
 10 |       l2btb: 12K-entry L2 BTB, 1 bubble
 11 |     }
 12 | 
 13 |     l1ic: L1 IC {
 14 |       # Source: Intel
 15 |       # "Larger instruction cache: 32K→64K."
 16 |       l1ic: 64KB L1 IC
 17 | 
 18 |       # Source: Chips and Cheese
 19 |       l1itlb: 256-entry 8-way L1 ITLB
 20 |     }
 21 | 
 22 |     bp -> l1ic
 23 | 
 24 |     iq: Instruction Queue
 25 |     l1ic -> iq
 26 | 
 27 |     # Source: Intel
 28 |     decode: 6-way Decode
 29 |     iq -> decode
 30 | 
 31 |     # Source: Chips and Cheese
 32 |     uopc: 4096-entry UOP Cache
 33 |     decode -> uopc
 34 |     bp -> uopc
 35 | 
 36 |     # Source: Intel
 37 |     # "Improved LSD coverage: the IDQ can hold 192 μops per logical processor in
 38 |     # single-thread mode or 96 μops per thread when SMT is active."
 39 |     uop: 2x96 UOP Queue
 40 |     uopc -> uop: 8 UOP/cycle
 41 |     decode -> uop: 6 UOP/cycle
 42 | 
 43 |     # Source: Intel
 44 |     rename: 6-way Rename {
 45 |       Move Elimination
 46 |       Zero Idiom
 47 |     }
 48 |     uop -> rename
 49 |   }
 50 | 
 51 |   backend: Backend {
 52 |     # Source: Intel
 53 |     # "8 -> 12 wide retirement" and "512 -> 576 deep instruction window" in
 54 |     # Redwood Cove vs Lion Cove comparison
 55 |     rob: 512-entry ROB, 8 wide retirement
 56 | 
 57 |     # Source: Chips and Cheese
 58 |     bob: 128-entry Branch Order Buffer
 59 | 
 60 |     rf: Register File {
 61 |       # Source: Chips and Cheese
 62 |       irf: 280-entry Integer Register File
 63 | 
 64 |       flagsrf: Flags Register File
 65 | 
 66 |       # Source: Chips and Cheese
 67 |       vrf: 332-entry FP/Vector Register File
 68 |     }
 69 | 
 70 |     # Source: Chips and Cheese
 71 |     sched1: 97-entry Unified Math Scheduler
 72 | 
 73 |     # Source: Intel
 74 |     pipe1: Port 0 {
 75 |       grid-columns: 1
 76 |       ALU
 77 |       LEA
 78 |       Shift
 79 |       JMP
 80 |       Vec FMA
 81 |       Vec ALU
 82 |       Vec Shift
 83 |       Vec fpDIV
 84 |     }
 85 |     rob -> sched1 -> rf -> pipe1
 86 | 
 87 |     # Source: Intel
 88 |     pipe2: Port 1 {
 89 |       grid-columns: 1
 90 |       ALU
 91 |       LEA
 92 |       Shift
 93 |       Int DIV
 94 |       Vec FMA
 95 |       Vec ALU
 96 |       Vec Shift
 97 |       Vec Shuffle
 98 |       Vec FADD
 99 |     }
100 |     rob -> sched1 -> rf -> pipe2
101 | 
102 |     # Source: Intel
103 |     pipe3: Port 5 {
104 |       grid-columns: 1
105 |       ALU
106 |       LEA
107 |       MulHi
108 |       Vec FMA512
109 |       Vec ALU
110 |       Vec AMX
111 |       Vec Shuffle
112 |       Vec FADD
113 |     }
114 |     rob -> sched1 -> rf -> pipe3
115 | 
116 |     # Source: Intel
117 |     pipe4: Port 6 {
118 |       grid-columns: 1
119 |       ALU
120 |       LEA
121 |       Shift
122 |       JMP
123 |     }
124 |     rob -> sched1 -> rf -> pipe4
125 | 
126 |     # Source: Intel
127 |     pipe5: Port 11 {
128 |       grid-columns: 1
129 |       ALU
130 |       LEA
131 |     }
132 |     rob -> sched1 -> rf -> pipe5
133 | 
134 |     # Source: Chips and Cheese
135 |     sched2: 70-entry Load Scheduler
136 | 
137 |     # Source: Intel
138 |     pipe6: Port 2 {
139 |       Load AGU
140 |     }
141 |     rob -> sched2 -> rf -> pipe6
142 | 
143 |     # Source: Intel
144 |     pipe7: Port 3 {
145 |       Load AGU
146 |     }
147 |     rob -> sched2 -> rf -> pipe7
148 | 
149 |     # Source: Intel
150 |     pipe8: Port 10 {
151 |       Load AGU
152 |     }
153 |     rob -> sched2 -> rf -> pipe8
154 | 
155 |     # Source: Chips and Cheese
156 |     sched3: 38-entry Store Scheduler
157 | 
158 |     # Source: Intel
159 |     pipe9: Port 7 {
160 |       Store AGU
161 |     }
162 |     rob -> sched3 -> rf -> pipe9
163 | 
164 |     # Source: Intel
165 |     pipe10: Port 8 {
166 |       Store AGU
167 |     }
168 |     rob -> sched3 -> rf -> pipe10
169 | 
170 |     # Source: Intel
171 |     pipe11: Port 4 {
172 |       Store Data
173 |     }
174 |     rob -> sched3 -> rf -> pipe11
175 | 
176 |     # Source: Intel
177 |     pipe12: Port 9 {
178 |       Store Data
179 |     }
180 |     rob -> sched3 -> rf -> pipe12
181 | 
182 |     lsu: LSU {
183 |       # Source: Chips and Cheese
184 |       lq: 192-entry Load Queue
185 |       sq: 114-entry Store Queue
186 | 
187 |       # Source: Intel
188 |       # Redwood Cove vs Lion Cove comparison
189 |       5 cycle load to use latency
190 |       3x256b or 2x512b read per cycle
191 |     }
192 | 
193 |     pipe6 -> lsu
194 |     pipe7 -> lsu
195 |     pipe8 -> lsu
196 |     pipe9 -> lsu
197 |     pipe10 -> lsu
198 |     pipe11 -> lsu
199 |     pipe12 -> lsu
200 |   }
201 |   frontend.rename -> backend.rob
202 |   frontend.rename -> backend.bob
203 | 
204 |   mem: Memory {
205 |     l1: L1 DC {
206 |       # Source: Intel
207 |       # "96 -> 128 pages DTLB" (96 is the Redwood Cove side) in the
208 |       # Redwood Cove vs Lion Cove comparison
209 |       l1dtlb: 96-entry L1 DTLB
210 | 
211 |       # Source: Intel
212 |       # Redwood Cove vs Lion Cove comparison
213 |       l1dc: 48KB L1DC
214 |     }
215 | 
216 |     l2: L2 {
217 |       # Source: Chips and Cheese
218 |       l2tlb: 2048-entry L2 TLB
219 | 
220 |       # Source: Intel
221 |       # "Mid-level-cache size increased to 2MBs for Client."
222 |       l2dc: 2MB L2 (Mid Level) Cache
223 | 
224 |       # Source: Intel
225 |       # "Increased number of outstanding misses (48→64 Deeper MLC miss queues)."
226 |       mshr: 64-entry MSHR
227 | 
228 |       # Source: Intel
229 |       16 cycle load to use latency
230 |       # Source: Intel
231 |       2x64B read per cycle
232 |     }
233 | 
234 |     l1 -> l2
235 |   }
236 |   frontend.l1ic -> mem.l2
237 |   backend.lsu -> mem.l1
238 | 
239 |   info: |md
240 |     Drawn by Jiajie Chen @jiegec
241 | 
242 |     Based on data from Chips and Cheese, Intel and Anandtech
243 |   |
244 | }
245 | 


--------------------------------------------------------------------------------
/docs/redwood_cove.md:
--------------------------------------------------------------------------------
 1 | # Intel Meteor Lake P-core aka Redwood Cove
 2 | 
 3 | ![](./redwood_cove.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Previewing Meteor Lake at CES](https://chipsandcheese.com/2024/01/11/previewing-meteor-lake-at-ces/)
 8 | - [Intel Unveils Meteor Lake Architecture: Intel 4 Heralds the Disaggregated Future of Mobile CPUs](https://www.anandtech.com/show/20046/intel-unveils-meteor-lake-architecture-intel-4-heralds-the-disaggregated-future-of-mobile-cpus/2)
 9 | - [2023 Intel Tech Tour: Meteor Lake Architecture Overview](https://www.intel.com/content/www/us/en/content-details/788851/2023-intel-tech-tour-meteor-lake-architecture-overview.html)
10 | - [Intel’s Redwood Cove: Baby Steps are Still Steps](https://chipsandcheese.com/2024/09/22/intels-redwood-cove-baby-steps-are-still-steps/)
11 | 


--------------------------------------------------------------------------------
/docs/skylake.d2:
--------------------------------------------------------------------------------
  1 | cpu : Intel Skylake CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     l1ic: L1 IC {
  7 |       # Source: Wikichip
  8 |       l1ic: 32KB 8-way L1 IC
  9 |     }
 10 | 
 11 |     bp -> l1ic
 12 | 
 13 |     # Source: Wikichip
 14 |     iq: 2x 25-entry Instruction Queue
 15 |     l1ic -> iq
 16 | 
 17 |     # Source: Wikichip
 18 |     decode: 4-way Decode
 19 |     iq -> decode
 20 | 
 21 |     # Source: Wikichip
 22 |     uopc: 1536-entry 8-way UOP Cache
 23 |     decode -> uopc
 24 |     bp -> uopc
 25 | 
 26 |     # Source: Wikichip
 27 |     uop: 2x 64-entry UOP Queue
 28 |     uopc -> uop
 29 |     decode -> uop
 30 | 
 31 |     # Source: Wikichip
 32 |     rename: Rename {
 33 |       Move Elimination
 34 |       Zero Idiom
 35 |     }
 36 |     uop -> rename
 37 |   }
 38 | 
 39 |   backend: Backend {
 40 |     # Source: Wikichip
 41 |     rob: ROB
 42 | 
 43 |     # Source: Wikichip
 44 |     bob: 48-entry Branch Order Buffer
 45 | 
 46 |     rf: "" {
 47 |       # Source: Wikichip
 48 |       # Source: jiegec, 141 speculative
 49 |       irf: 180-entry Integer Register File
 50 | 
 51 |       # Source: jiegec, 141 speculative
 52 |       flagsrf: 141-entry Flags Register File
 53 | 
 54 |       # Source: Wikichip
 55 |       # Source: jiegec, 106 speculative
 56 |       vrf: 168-entry FP/Vector Register File
 57 |     }
 58 | 
 59 |     # Source: Wikichip
 60 |     sched1: 97-entry Unified Scheduler
 61 | 
 62 |     # Source: Wikichip
 63 |     pipe1: Port 0 {
 64 |       grid-columns: 1
 65 |       ALU
 66 |       INT DIV
 67 |       INT Vec ALU
 68 |       INT Vec MUL
 69 |       FMA
 70 |       AES
 71 |       Vec String
 72 |       FDIV
 73 |       Branch
 74 |     }
 75 |     rob -> sched1 -> rf -> pipe1
 76 | 
 77 |     # Source: Wikichip
 78 |     pipe2: Port 1 {
 79 |       grid-columns: 1
 80 |       ALU
 81 |       MUL
 82 |       INT Vec ALU
 83 |       INT Vec MUL
 84 |       FMA
 85 |       Bit Scan
 86 |     }
 87 |     rob -> sched1 -> rf -> pipe2
 88 | 
 89 |     # Source: Wikichip
 90 |     pipe3: Port 5 {
 91 |       grid-columns: 1
 92 |       ALU
 93 |       Vec Shuffle
 94 |       INT Vec ALU
 95 |       LEA
 96 |     }
 97 |     rob -> sched1 -> rf -> pipe3
 98 | 
 99 |     # Source: Wikichip
100 |     pipe4: Port 6 {
101 |       grid-columns: 1
102 |       ALU
103 |       Branch
104 |     }
105 |     rob -> sched1 -> rf -> pipe4
106 | 
107 |     # Source: Wikichip
108 |     pipe5: Port 2 {
109 |       AGU
110 |       Load Data
111 |     }
112 |     rob -> sched1 -> rf -> pipe5
113 | 
114 |     # Source: Wikichip
115 |     pipe6: Port 3 {
116 |       AGU
117 |       Load Data
118 |     }
119 |     rob -> sched1 -> rf -> pipe6
120 | 
121 |     # Source: Wikichip
122 |     pipe7: Port 4 {
123 |       Store Data
124 |     }
125 |     rob -> sched1 -> rf -> pipe7
126 | 
127 |     # Source: Wikichip
128 |     pipe8: Port 7 {
129 |       AGU
130 |     }
131 |     rob -> sched1 -> rf -> pipe8
132 | 
133 | 
134 |     lsu: LSU {
135 |       # Source: Wikichip
136 |       72-entry Load Queue
137 |       56-entry Store Queue
138 |     }
139 | 
140 |     pipe5 -> lsu
141 |     pipe6 -> lsu
142 |     pipe7 -> lsu
143 |     pipe8 -> lsu
144 |   }
145 |   frontend.rename -> backend.rob
146 |   frontend.rename -> backend.bob
147 | 
148 |   mem: Memory {
149 |     l1: L1 DC {
150 |       # Source: Wikichip
151 |       l1dc: 32KB 8-way L1DC
152 | 
153 |       # Source: Wikichip
154 |       mshr: 10-entry MSHR
155 |     }
156 | 
157 |     l2: L2 {
158 |       # Source: Wikichip, Intel
159 |       # "Skylake server microarchitecture implements a mid-level (L2) cache of 1
160 |       # MB capacity with a minimum load-to-use latency of 14 cycles. The
161 |       # mid-level cache capacity is four times larger than the capacity in
162 |       # previous Intel Xeon processor family implementations. The line size of
163 |       # the mid-level cache is 64B and it is 16-way associative. The mid-level
164 |       # cache is private to each core."
165 |       l2dc: 256 KB 4-way/1MB 16-way L2 DC
166 |       14 cycle load to use latency
167 |     }
168 | 
169 |     # Source: Wikichip
170 |     l1 -> l2: 64B/cycle
171 | 
172 |     l3: L3 {
173 |       # Source: Wikichip
174 |       l3dc: L3 Cache
175 |     }
176 |     l2 -> l3
177 |   }
178 |   frontend.l1ic -> mem.l2
179 |   backend.lsu -> mem.l1
180 | 
181 |   info: |md
182 |     Drawn by Jiajie Chen @jiegec
183 | 
184 |     Based on data from Wikichip
185 |   |
186 | }


--------------------------------------------------------------------------------
/docs/skylake.md:
--------------------------------------------------------------------------------
1 | # Intel Skylake
2 | 
3 | ![](./skylake.svg)
4 | 
5 | References:
6 | 
7 | - [Skylake (client) - Microarchitectures - Intel](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client))
8 | - [Skylake (server) - Microarchitectures - Intel](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server))
9 | 


--------------------------------------------------------------------------------
/docs/skymont.d2:
--------------------------------------------------------------------------------
  1 | cpu : Intel Lunar Lake E-core Skymont CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |     }
  5 | 
  6 |     l1ic: L1 IC {
  7 |       # Source: Intel
  8 |       l1ic: 64K L1 IC
  9 |     }
 10 | 
 11 |     # Source: Intel
 12 |     fq: 3x Fetch Queue
 13 |     bp -> fq
 14 |     fq -> l1ic
 15 | 
 16 |     iq1: Instruction Queue \#1
 17 |     l1ic -> iq1
 18 | 
 19 |     # Source: Intel
 20 |     decode1: 3-way Decode \#1
 21 |     iq1 -> decode1
 22 | 
 23 |     # Source: Intel
 24 |     # "Uop queue capacity: 64 -> 96 entries"
 25 |     uop1: 32-entry UOP Queue \#1
 26 |     decode1 -> uop1
 27 | 
 28 |     iq2: Instruction Queue \#2
 29 |     l1ic -> iq2
 30 | 
 31 |     # Source: Intel
 32 |     decode2: 3-way Decode \#2
 33 |     iq2 -> decode2
 34 | 
 35 |     # Source: Intel
 36 |     uop2: 32-entry UOP Queue \#2
 37 |     decode2 -> uop2
 38 | 
 39 |     # Source: Intel
 40 |     iq3: Instruction Queue \#3
 41 |     l1ic -> iq3
 42 | 
 43 |     # Source: Intel
 44 |     decode3: 3-way Decode \#3
 45 |     iq3 -> decode3
 46 | 
 47 |     # Source: Intel
 48 |     uop3: 32-entry UOP Queue \#3
 49 |     decode3 -> uop3
 50 | 
 51 |     # Source: Intel
 52 |     # "allocation (from 6)"
 53 |     rename: 8-way Rename {
 54 |       Move Elimination
 55 |       Zero Idiom
 56 |     }
 57 |     uop1 -> rename
 58 |     uop2 -> rename
 59 |     uop3 -> rename
 60 |   }
 61 | 
 62 |   backend: Backend  {
 63 |     # Source: Intel
 64 |     # "Larger out of order window (ROB 256 -> 416 entry)"
 65 |     # "16-wide retire (from 8)"
 66 |     rob: 416-entry ROB, 16-wide retirement
 67 | 
 68 |     rf: Register File {
 69 |       irf: Integer Register File
 70 | 
 71 |       vrf: Vector Register File
 72 |     }
 73 | 
 74 |     sched1: Scheduler \#1
 75 | 
 76 |     # Source: Intel
 77 |     pipe1: Port 0 {
 78 |       grid-columns: 1
 79 |       ALU
 80 |       SHIFT
 81 |       MUL
 82 |       DIV
 83 |     }
 84 |     rob -> sched1 -> rf -> pipe1
 85 | 
 86 |     sched2: Scheduler \#2
 87 | 
 88 |     # Source: Intel
 89 |     pipe2: Port 5 {
 90 |       grid-columns: 1
 91 |       ALU
 92 |     }
 93 |     rob -> sched2 -> rf -> pipe2
 94 | 
 95 |     sched3: Scheduler \#3
 96 | 
 97 |     # Source: Intel
 98 |     pipe3: Port 2 {
 99 |       grid-columns: 1
100 |       ALU
101 |       SHIFT
102 |       MUL
103 |       DIV
104 |     }
105 |     rob -> sched3 -> rf -> pipe3
106 | 
107 |     sched4: Scheduler \#4
108 | 
109 |     # Source: Intel
110 |     pipe4: Port 6 {
111 |       grid-columns: 1
112 |       ALU
113 |     }
114 |     rob -> sched4 -> rf -> pipe4
115 | 
116 |     sched5: Scheduler \#5
117 | 
118 |     # Source: Intel; NOTE(review): was a duplicate "Port 0", Port 1 inferred - confirm
119 |     pipe5: Port 1 {
120 |       grid-columns: 1
121 |       ALU
122 |       SHIFT
123 |     }
124 |     rob -> sched5 -> rf -> pipe5
125 | 
126 |     sched6: Scheduler \#6
127 | 
128 |     # Source: Intel; NOTE(review): was a duplicate "Port 0", Port 4 inferred - confirm
129 |     pipe6: Port 4 {
130 |       grid-columns: 1
131 |       ALU
132 |     }
133 |     rob -> sched6 -> rf -> pipe6
134 | 
135 |     sched7: Scheduler \#7
136 | 
137 |     # Source: Intel
138 |     pipe7: Port 3 {
139 |       grid-columns: 1
140 |       ALU
141 |       SHIFT
142 |     }
143 |     rob -> sched7 -> rf -> pipe7
144 | 
145 |     sched8: Scheduler \#8
146 | 
147 |     # Source: Intel
148 |     pipe8: Port 7 {
149 |       grid-columns: 1
150 |       ALU
151 |     }
152 |     rob -> sched8 -> rf -> pipe8
153 | 
154 |     sched9: Scheduler \#9
155 | 
156 |     # Source: Intel
157 |     pipe9: Port 30 {
158 |       grid-columns: 1
159 |       JMP
160 |     }
161 |     rob -> sched9 -> rf -> pipe9
162 | 
163 |     sched10: Scheduler \#10
164 | 
165 |     # Source: Intel
166 |     pipe10: Port 31 {
167 |       grid-columns: 1
168 |       JMP
169 |     }
170 |     rob -> sched10 -> rf -> pipe10
171 | 
172 |     sched11: Scheduler \#11
173 | 
174 |     # Source: Intel
175 |     pipe11: Port 32 {
176 |       grid-columns: 1
177 |       JMP
178 |     }
179 |     rob -> sched11 -> rf -> pipe11
180 | 
181 |     sched12: Scheduler \#12
182 | 
183 |     # Source: Intel
184 |     pipe12: Port 8 {
185 |       grid-columns: 1
186 |       STD
187 |     }
188 |     rob -> sched12 -> rf -> pipe12
189 | 
190 |     sched13: Scheduler \#13
191 | 
192 |     # Source: Intel
193 |     pipe13: Port 9 {
194 |       grid-columns: 1
195 |       STD
196 |     }
197 |     rob -> sched13 -> rf -> pipe13
198 | 
199 |     sched14: Scheduler \#14
200 | 
201 |     # Source: Intel
202 |     pipe14: Port 10 {
203 |       grid-columns: 1
204 |       LD AGU
205 |     }
206 |     rob -> sched14 -> rf -> pipe14
207 | 
208 |     sched15: Scheduler \#15
209 | 
210 |     # Source: Intel
211 |     pipe15: Port 11 {
212 |       grid-columns: 1
213 |       LD AGU
214 |     }
215 |     rob -> sched15 -> rf -> pipe15
216 | 
217 |     sched16: Scheduler \#16
218 | 
219 |     # Source: Intel
220 |     pipe16: Port 12 {
221 |       grid-columns: 1
222 |       LD AGU
223 |     }
224 |     rob -> sched16 -> rf -> pipe16
225 | 
226 |     sched17: Scheduler \#17
227 | 
228 |     # Source: Intel
229 |     pipe17: Port 13 {
230 |       grid-columns: 1
231 |       ST AGU
232 |     }
233 |     rob -> sched17 -> rf -> pipe17
234 | 
235 |     sched18: Scheduler \#18
236 | 
237 |     # Source: Intel
238 |     pipe18: Port 14 {
239 |       grid-columns: 1
240 |       ST AGU
241 |     }
242 |     rob -> sched18 -> rf -> pipe18
243 | 
244 |     sched19: Scheduler \#19
245 | 
246 |     # Source: Intel
247 |     pipe19: Port 15 {
248 |       grid-columns: 1
249 |       ST AGU
250 |     }
251 |     rob -> sched19 -> rf -> pipe19
252 | 
253 |     sched20: Scheduler \#20
254 | 
255 |     # Source: Intel
256 |     pipe20: Port 16 {
257 |       grid-columns: 1
258 |       ST AGU
259 |     }
260 |     rob -> sched20 -> rf -> pipe20
261 | 
262 |     sched21: Scheduler \#21
263 | 
264 |     # Source: Intel
265 |     pipe21: Port 28 {
266 |       grid-columns: 1
267 |       VEC STD
268 |     }
269 |     rob -> sched21 -> rf -> pipe21
270 | 
271 |     sched22: Scheduler \#22
272 | 
273 |     # Source: Intel
274 |     pipe22: Port 29 {
275 |       grid-columns: 1
276 |       VEC STD
277 |     }
278 |     rob -> sched22 -> rf -> pipe22
279 | 
280 |     lsu: LSU {
281 |       Load Queue
282 |       Store Queue
283 |     }
284 | 
285 |     pipe12 -> lsu
286 |     pipe13 -> lsu
287 |     pipe14 -> lsu
288 |     pipe15 -> lsu
289 |     pipe16 -> lsu
290 |     pipe17 -> lsu
291 |     pipe18 -> lsu
292 |     pipe19 -> lsu
293 |     pipe20 -> lsu
294 |     pipe21 -> lsu
295 |     pipe22 -> lsu
296 | 
297 |     sched23: Scheduler \#23
298 | 
299 |     # Source: Intel
300 |     pipe23: Port 21 {
301 |       grid-columns: 1
302 |       SALU
303 |       SHUF
304 |       SIMUL
305 |       FADD
306 |       FMA
307 |     }
308 |     rob -> sched23 -> rf -> pipe23
309 | 
310 |     sched24: Scheduler \#24
311 | 
312 |     # Source: Intel
313 |     pipe24: Port 20 {
314 |       grid-columns: 1
315 |       SALU
316 |       SIMUL
317 |       SHUF
318 |       FADD
319 |       FDIV
320 |       AES
321 |       SHA
322 |       FMA
323 |     }
324 |     rob -> sched24 -> rf -> pipe24
325 | 
326 |     sched25: Scheduler \#25
327 | 
328 |     # Source: Intel
329 |     pipe25: Port 22 {
330 |       grid-columns: 1
331 |       SALU
332 |       SIMUL
333 |       SHUF
334 |       FADD
335 |       FDIV
336 |       AES
337 |       FMA
338 |     }
339 |     rob -> sched25 -> rf -> pipe25
340 | 
341 |     sched26: Scheduler \#26
342 | 
343 |     # Source: Intel
344 |     pipe26: Port 23 {
345 |       grid-columns: 1
346 |       SALU
347 |       SHUF
348 |       SIMUL
349 |       FADD
350 |       FMA
351 |     }
352 |     rob -> sched26 -> rf -> pipe26
353 |   }
354 | 
355 |   frontend.rename -> backend.rob
356 | 
357 |   mem: Memory {
358 |     l1: L1 DC {
359 |       # Source: Intel
360 |       l1dc: 32KB L1 DC
361 |     }
362 | 
363 |     l2: L2 {
364 |       # Source: Intel
365 |       l2dc: 4MB L2 Cache shared among 4 cores
366 |     }
367 | 
368 |     l1 -> l2
369 |   }
370 |   frontend.l1ic -> mem.l2
371 |   backend.lsu -> mem.l1
372 | 
373 |   info: |md
374 |     Drawn by Jiajie Chen @jiegec
375 | 
376 |     Based on data from Intel
377 |   |
378 | }


--------------------------------------------------------------------------------
/docs/skymont.md:
--------------------------------------------------------------------------------
 1 | # Intel Lunar Lake E-core aka Skymont
 2 | 
 3 | ![](./skymont.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Intel Details Skymont](https://chipsandcheese.com/2024/06/15/intel-details-skymont/)
 8 | - [Thoughts on Skymont Slides](https://chipsandcheese.com/2024/05/30/thoughts-on-skymont-slides/)
 9 | - [月光下的新探索：Lunar Lake CPU (Lion Cove / Skymont) 微架构测试](https://blog.hjc.im/lunar-lake-cpu-uarch-review.html)
10 | - [Skymont: Intel’s E-Cores reach for the Sky](https://chipsandcheese.com/2024/10/03/skymont-intels-e-cores-reach-for-the-sky/)
11 | 


--------------------------------------------------------------------------------
/docs/sunny_cove.d2:
--------------------------------------------------------------------------------
  1 | cpu : Intel Ice Lake Sunny Cove CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese
  5 |       l1btb: 256-entry L1 BTB
  6 | 
  7 |       # Source: Chips and Cheese
  8 |       l2btb: 5120-entry L2 BTB
  9 | 
 10 |       # Source: Chips and Cheese
 11 |       ras: 22-entry RAS
 12 |     }
 13 | 
 14 |     l1ic: L1 IC {
 15 |       # Source: Chips and Cheese
 16 |       itlb: 128-entry 8-way ITLB
 17 | 
 18 |       # Source: Chips and Cheese
 19 |       l1ic: 32KB 8-way L1 IC
 20 |     }
 21 | 
 22 |     bp -> l1ic
 23 | 
 24 |     # Source: Chips and Cheese, Intel
 25 |     # "70/thread; 140/1 thread"
 26 |     iq: 2x 70-entry Instruction Queue
 27 |     l1ic -> iq
 28 | 
 29 |     # Source: Chips and Cheese
 30 |     decode: 4-way Decode
 31 |     iq -> decode
 32 | 
 33 |     # Source: Chips and Cheese
 34 |     uopc: 2304-entry UOP Cache
 35 |     decode -> uopc
 36 |     bp -> uopc
 37 | 
 38 |     # Source: Chips and Cheese
 39 |     # Source: Intel, called allocation queue
 40 |     uop: 2x 70-entry UOP Queue
 41 |     uopc -> uop
 42 |     decode -> uop
 43 | 
 44 |     # Source: Chips and Cheese
 45 |     rename: 5-way Rename {
 46 |       Move Elimination
 47 |       Zero Idiom
 48 |     }
 49 |     uop -> rename
 50 |   }
 51 | 
 52 |   backend: Backend {
 53 |     # Source: Chips and Cheese, Intel
 54 |     rob: 352-entry ROB
 55 | 
 56 |     # Source: Chips and Cheese
 57 |     bob: 96-entry Branch Order Buffer
 58 | 
 59 |     rf: "" {
 60 |       # Source: Chips and Cheese, Intel
 61 |       # Source: jiegec, 239 speculative
 62 |       irf: 280-entry Integer Register File
 63 | 
 64 |       # Source: Chips and Cheese, Intel
 65 |       # Source: jiegec, 158 speculative
 66 |       vrf: 224-entry FP/Vector Register File
 67 | 
 68 |       # Source: jiegec, 237 speculative
 69 |       flagsrf: 238-entry Flags Register File
 70 | 
 71 |       # Source: Chips and Cheese
 72 |       mxcsrrf: 8-entry MXCSR Register File
 73 | 
 74 |       # Source: Chips and Cheese
 75 |       maskrf: 152-entry Mask Register File
 76 |     }
 77 | 
 78 |     # Source: uops.info
 79 |     # 10 ports in total: Port 0 to Port 9
 80 |     # Integer/Memory:
 81 |     # Port 2 = Port 3: Load AGU
 82 |     # Port 4 = Port 9: Store Data
 83 |     # Port 7 = Port 8: Store AGU
 84 |     # ALU(add): Port 0, 1, 5, 6
 85 |     # LEA(lea): Port 0, 1, 5, 6
 86 |     # Shift(shl): Port 0, 6
 87 |     # INT MUL(imul): Port 1
 88 |     # INT MUL(imul 64b*64b=128b): Port 1(LO) + Port 5(HI)
 89 |     # INT DIV(idiv): (Port 0, 1, 5, 6) + Port 1
 90 |     # JMP/Branch(jmp/jnz): Port 0, 6
 91 |     # Load(mov): Port 2, 3
 92 |     # Store(mov): Data(Port 4, 9) + AGU(Port 7, 8)
 93 |     # 256-bit ALU(VPADDD YMM): Port 0, 1, 5
 94 |     # 512-bit ALU(VPADDD ZMM): Port 0, 5
 95 |     # 256-bit FMA(VFMADD132PD YMM): Port 0, 1
 96 |     # 512-bit FMA(VFMADD132PD ZMM): Port 0
 97 |     # 256-bit FADD(VADDPS YMM): Port 0, 1
 98 |     # 512-bit FADD(VADDPS ZMM): Port 0
 99 |     # 256-bit FDIV(VDIVPS YMM): Port 0
100 |     # 256-bit Shuffle(VPSHUFD YMM): Port 1, 5
101 |     # 512-bit Shuffle(VPSHUFD ZMM): Port 5
102 |     # 256-bit Shift(VPSLLD YMM): Port 0, 1
103 |     # 512-bit Shift(VPSLLD ZMM): Port 0
104 | 
105 |     # Source: Chips and Cheese
106 |     # Source: jiegec, 81 sched size for fp
107 |     # Source: jiegec, 40 sched size for f2i
108 |     # Source: Intel, 160 scheduler entries in total(160=80+34+23+23)
109 |     sched1: 80-entry Unified Math Scheduler
110 | 
111 |     # Source: Chips and Cheese
112 |     pipe1: Port 0 {
113 |       grid-columns: 1
114 |       ALU
115 |       LEA
116 |       Shift
117 |       Jump/Branch
118 |       512b ALU
119 |       512b FADD
120 |       512b FMA
121 |       512b Shift
122 |     }
123 |     rob -> sched1 -> rf -> pipe1
124 | 
125 |     # Source: Chips and Cheese
126 |     pipe2: Port 1 {
127 |       grid-columns: 1
128 |       ALU
129 |       LEA
130 |       INT MUL
131 |       INT DIV
132 |       256b ALU
133 |       256b FADD
134 |       256b FMA
135 |       256b Shift
136 |       256b Shuffle
137 |     }
138 |     rob -> sched1 -> rf -> pipe2
139 | 
140 |     # Source: Chips and Cheese
141 |     pipe3: Port 5 {
142 |       grid-columns: 1
143 |       ALU
144 |       LEA
145 |       INT MUL HI
146 |       512b ALU
147 |       512b Shuffle
148 |     }
149 |     rob -> sched1 -> rf -> pipe3
150 | 
151 |     # Source: Chips and Cheese
152 |     pipe4: Port 6 {
153 |       grid-columns: 1
154 |       ALU
155 |       LEA
156 |       Shift
157 |       Jump/Branch
158 |     }
159 |     rob -> sched1 -> rf -> pipe4
160 | 
161 |     # Source: Chips and Cheese
162 |     # Source: jiegec, 36 sched size for store data
163 |     sched2: 34-entry Store Data Scheduler
164 | 
165 |     # Source: Chips and Cheese
166 |     pipe5: Port 4 {
167 |       Store Data
168 |     }
169 |     rob -> sched2 -> rf -> pipe5
170 | 
171 |     # Source: Chips and Cheese
172 |     pipe6: Port 9 {
173 |       Store Data
174 |     }
175 |     rob -> sched2 -> rf -> pipe6
176 | 
177 |     # Source: Chips and Cheese
178 |     # Source: jiegec, 48(~46=23+23) sched size for load & store address
179 |     sched3: 23-entry AGU Scheduler \#1
180 | 
181 |     # Source: Chips and Cheese
182 |     pipe7: Port 2 {
183 |       Load AGU
184 |     }
185 |     rob -> sched3 -> rf -> pipe7
186 | 
187 |     # Source: Chips and Cheese
188 |     pipe8: Port 8 {
189 |       Store AGU
190 |     }
191 |     rob -> sched3 -> rf -> pipe8
192 | 
193 |     # Source: Chips and Cheese
194 |     sched4: 23-entry AGU Scheduler \#2
195 | 
196 |     # Source: Chips and Cheese
197 |     pipe9: Port 3 {
198 |       Load AGU
199 |     }
200 |     rob -> sched4 -> rf -> pipe9
201 | 
202 |     # Source: Chips and Cheese
203 |     pipe10: Port 7 {
204 |       Store AGU
205 |     }
206 |     rob -> sched4 -> rf -> pipe10
207 | 
208 |     lsu: LSU {
209 |       # Source: Chips and Cheese, Intel
210 |       128-entry Load Queue
211 |       72-entry Store Queue
212 |     }
213 | 
214 |     pipe5 -> lsu
215 |     pipe6 -> lsu
216 |     pipe7 -> lsu
217 |     pipe8 -> lsu
218 |     pipe9 -> lsu
219 |     pipe10 -> lsu
220 |   }
221 |   frontend.rename -> backend.rob
222 |   frontend.rename -> backend.bob
223 | 
224 |   mem: Memory {
225 |     l1: L1 DC {
226 |       # Source: Chips and Cheese
227 |       l1dtlb: 64-entry load, 16-entry store L1 DTLB
228 | 
229 |       # Source: Chips and Cheese, Intel
230 |       # Intel says 8-way?
231 |       l1dc: 48KB 12-way L1DC, 5 cycle latency, 64B cache line
232 | 
233 |       # Source: Chips and Cheese
234 |       mshr: 12-entry MSHR
235 |     }
236 | 
237 |     l2: L2 {
238 |       # Source: Chips and Cheese, Intel
239 |       l2tlb: 2048-entry L2 TLB
240 | 
241 |       # Source: Chips and Cheese
242 |       l2dc: 512 KB 8-way/1280 KB 20-way L2 DC, 13 cycle latency
243 | 
244 |       # Source: Chips and Cheese
245 |       mshr: 32-entry MSHR
246 |     }
247 | 
248 |     # Source: Chips and Cheese
249 |     l1 -> l2: 64B/cycle
250 | 
251 |     l3: L3 {
252 |       # Source: Chips and Cheese
253 |       l3dc: up to 2MB/core 16-way L3 Cache
254 |     }
255 |     l2 -> l3
256 |   }
257 |   frontend.l1ic -> mem.l2
258 |   backend.lsu -> mem.l1
259 | 
260 |   info: |md
261 |     Drawn by Jiajie Chen @jiegec
262 | 
263 |     Based on data from Chips and Cheese, Intel
264 |   |
265 | }


--------------------------------------------------------------------------------
/docs/sunny_cove.md:
--------------------------------------------------------------------------------
 1 | # Intel Ice Lake aka Sunny Cove
 2 | 
 3 | ![](./sunny_cove.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Sunny Cove: Intel’s Lost Generation](https://chipsandcheese.com/2022/06/07/sunny-cove-intels-lost-generation/)
 8 | - [Popping the Hood on Golden Cove](https://chipsandcheese.com/2021/12/02/popping-the-hood-on-golden-cove/)
 9 | - [Sunny Cove - Microarchitectures - Intel](https://en.wikichip.org/wiki/intel/microarchitectures/sunny_cove)
10 | - [Golden Cove’s Vector Register File: Checking with Official (SPR) Data](https://chipsandcheese.com/2023/01/15/golden-coves-vector-register-file-checking-with-official-spr-data/)
11 | - [4th Gen Intel Xeon Scalable Sapphire Rapids Leaps Forward](https://www.servethehome.com/4th-gen-intel-xeon-scalable-sapphire-rapids-leaps-forward/7/)
12 | - Intel 64 and IA-32 Architectures Optimization Reference Manual Volume 1
13 | - [Built for the Edge: The Next-Generation Intel® Xeon D 2700 & 1700 processors](https://hc34.hotchips.org/assets/program/conference/day2/Mobile%20and%20Edge/HC2022.XeonDx700.PraveenMosur.FINAL.pdf)
14 | 


--------------------------------------------------------------------------------
/docs/uarch.csv:
--------------------------------------------------------------------------------
 1 | ﻿uArch,L1 BTB,L2 BTB,L3 BTB,ITA,RAS,L1 ITLB,L2 ITLB,L1 DTLB,L2 DTLB,L2 Unified TLB,L1 IC,Decode width,UOP/MOP Cache,UOP/MOP width,Rename width,ROB,Int RF,Flag RF,Float RF,Load to use latency,Load to FP use latency,Branch units,ALU units,FP/Vec units,DP FLOP/cycle,Load/Store pipes,Load-only pipes,Store-only pipes
 2 | AMD Zen 1,8 (0 bubble),256 (1 bubble),4096 (4 bubble),512,32,64,512,64,1536,,4-way 64KB,4,8-way 256-entry 8-inst/entry,8,6+4,192,168,,160,4,7,2,4,4x 128b,8,0,2,1
 3 | AMD Zen 2,16 (0 bubble),512 (1 bubble),7168 (4 bubble),1024,32,64,512,64,2048,,8-way 32KB,4,8-way 512-entry 8-inst/entry,8,6+4,224,180,138,160,4,7,2,4,4x 256b,16,0,2,1
 4 | AMD Zen 3,1024 (0 bubble),6656 (3 bubble),,1536,2x32,64,512,64,2048,,8-way 32KB,4,8-way 512-entry 8-mop/entry,8,6,256,192,122,160,4,7,2,4,4x 256b,16,2,1,0
 5 | AMD Zen 4,1536 (0 bubble),7168/7680 (3 bubble),,3072,2x32,64,512,72,3072,,8-way 32KB,4,12-way 768-entry 9-mop/entry,9,6,320,224,126,192,4,7,2,4,4x 256b,16,2,1,0
 6 | AMD Zen 5,16384 (0 bubble),8192 (8 bubble),,3072,2x52,64,2048,96,4096,,8-way 32KB,2x4,16-way 1024-entry 6-inst/entry,2x6,8,448,240,192,384,4,7,3,6,4x 512b,32,2,2,0
 7 | Ampere One,256 (0 bubble),8192 (2 bubble),,,,64,768,64,1536,,4-way 16KB,5,,,4,208,166,,128,,,2,4,2,,0,2,2
 8 | Apple Avalanche,1024 (0 bubble),3072 (1 bubble),192KB L1 IC (2 bubble),,50,192,,,,,192KB,8,N/A,N/A,8,274 Group,350,,380,3-4,,2,6,4x 128b,16,1,2,1
 9 | Apple Firestorm,1024 (0 bubble),192KB L1 IC (2 bubble),,,50,192,,160,,3072,6-way 192KB,8,N/A,N/A,8,330 Group,380,128,432,3-4,,2,6,4x 128b,16,1,2,1
10 | Apple Icestorm,,,,,32,128,,128,,,128KB,4,N/A,N/A,4,,,,,3-4,,2,4,2x 128b,8,1,0,1
11 | ARM Cortex-A77,64 (0 bubble),8192,,,,48,,48,,1280,4-way 64KB,6,1536-entry,6,6,160 MOP,,,,4,5,2,4,2,,2,0,0
12 | ARM Cortex-X1,96 (0 bubble),8192 (1 bubble),,,16,48,,40,,2048,4-way 64KB,5,4-way 3072-entry,8,8,224 MOP,,,,4,6,2,4,4,8,2,1,0
13 | ARM Cortex-X2,,,,,,48,,48,,2048,4-way 64KB,5,4-way 3072-entry,8,8,288 MOP,,,,4,6,2,4,4,,2,1,0
14 | ARM Cortex-X3,,,,,32,48,,,,,4-way 64KB,6,4-way 1536-entry,8,8,320 MOP,,,,4,6,2,6,4,,2,1,0
15 | ARM Cortex-X4,,,,,32,48,,96,,2048,4-way 64KB,10,N/A,N/A,10,384 MOP,,,,,,3,8,4,,1,2,1
16 | ARM Cortex-X925,,,,,32,128,,96,,2048,4-way 64KB,10,N/A,N/A,10,?,,,,4,6,3,8,6,,2,2,0
17 | Intel Crestmont,1024 (0 bubble),6144 (2 bubble),,,,64,,48,,3072,8-way 64KB,2x3,,,6,256,214,,207,3-4,,2,4,3,,0,2,2
18 | Intel Golden Cove,128 (0 bubble),6144 (1 bubble),12288 (2 bubble),,20,256,,96,,2048,8-way 32KB,6,4096-entry,8,6,512,288,248,332,5,,2,5,3,,0,3,2
19 | Intel Gracemont,1024 (0 bubble),5120 (2 bubble),,,,64,,32,,2048,8-way 64KB,2x3,,,5,256,214,,207,3-4,,2,4,3x 128b,8,0,2,2
20 | Intel Lion Cove,,,,,,,,128,,,,8,,12,8,576,,,,4,,3,6,4x 256b,,?,?,?
21 | Intel Redwood Cove,128 (0 bubble),2x6144 (1 bubble),,,,,,96,,,64KB,6,,8,6,512,,,,5,,2,5,3x 256b,,?,?,?
22 | Intel Skymont,,,,,,128,,,,,64KB,3x3,,3x3,8,416,,,,,,3,8,4x 128b,,?,?,?
23 | Intel Sunny Cove,256 (0 bubble),5120 (1 bubble),,,22,128,,64,,2048,8-way 32KB,4,2304-entry,,5,352,280,238,224,5,,2,4,2x 512b,32,0,2,2
24 | Qualcomm Oryon,2048 (0 bubble),192KB L1 IC (2 bubble),,,50,256,,224,,8192,6-way 192KB,8,N/A,N/A,8,680,416,,416,3-4,,2,6,4x 128b,16,2,2,0
25 | 


--------------------------------------------------------------------------------
/docs/xiaomi.d2:
--------------------------------------------------------------------------------
  1 | cpu : Phytium Xiaomi CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Phytium
  5 |       # "2048-entry BTB"
  6 |       l1btb: 2048-entry BTB
  7 | 
  8 |       # Source: Phytium
  9 |       # "512-entry indirect predictor"
 10 |       indir: 512-entry Indirect Predictor
 11 | 
 12 |       # Source: Phytium
 13 |       # "48-entry Speculative Return Stack"
 14 |       ras: 48-entry Return Address Stack
 15 | 
 16 |       # Source: Phytium
 17 |       # "Direction predict with TAGE predictor"
 18 |       tage: TAGE-based direction predictor
 19 |     }
 20 | 
 21 |     l1ic: L1 IC {
 22 |       # Source: Phytium
 23 |       # "32KB L1 instr. Cache"
 24 |       l1ic: 32KB L1 IC
 25 |     }
 26 | 
 27 |     # Source: Phytium
 28 |     # "Loop detect and Instr. Cache bypass"
 29 |     loop: Loop Buffer
 30 | 
 31 |     fq: Fetch Queue
 32 |     bp -> fq
 33 |     fq -> l1ic
 34 | 
 35 |     # Source: Phytium
 36 |     # "32-entry instruction buffer"
 37 |     iq: 32-entry Instruction Buffer
 38 | 
 39 |     loop -> iq
 40 |     l1ic -> iq
 41 | 
 42 |     # Source: Phytium
 43 |     # "Up to four instructions decoded per cycle"
 44 |     decode: 4-way Decode
 45 |     iq -> decode
 46 | 
 47 |     # Source: Phytium
 48 |     # "Up to four instructions dispatched per cycle"
 49 |     dispatch: 4-way Dispatch
 50 |     decode -> dispatch
 51 |   }
 52 | 
 53 |   backend: Backend {
 54 |     # Source: Phytium
 55 |     # "Up to four instructions renamed per cycle"
 56 |     rename: Rename
 57 | 
 58 |     # Source: Phytium
 59 |     # "Reorder buffer can hold 160 instructions"
 60 |     rob: 160-entry ROB
 61 |     rename -> rob
 62 | 
 63 |     rf: Register File {
 64 |       # Source: Phytium
 65 |       # "192 physical registers"
 66 |       irf: 192-entry Integer Register File
 67 | 
 68 |       vrf: Vector Register File
 69 |     }
 70 | 
 71 |     # Source: Phytium
 72 |     # "Two separated 16-entry integer and ASIMD queues shared by four integer units"
 73 |     sched1: 16-entry Single-cycle ALU Scheduler
 74 | 
 75 |     # Source: Phytium
 76 |     pipe1: Pipe \#1 {
 77 |       ALU
 78 |       BR
 79 |     }
 80 |     sched1 -> rf -> pipe1
 81 | 
 82 |     # Source: Phytium
 83 |     pipe2: Pipe \#2 {
 84 |       ALU
 85 |       BR
 86 |     }
 87 |     sched1 -> rf -> pipe2
 88 | 
 89 |     # Source: Phytium
 90 |     # "Two separated 16-entry integer and ASIMD queues shared by four integer units"
 91 |     sched2: 16-entry Multi-cycle ALU Scheduler
 92 | 
 93 |     # Source: Phytium
 94 |     pipe3: Pipe \#3 {
 95 |       MUL
 96 |       DIV
 97 |     }
 98 |     sched2 -> rf -> pipe3
 99 | 
100 |     # Source: Phytium
101 |     pipe4: Pipe \#4 {
102 |       MUL
103 |       DIV
104 |     }
105 |     sched2 -> rf -> pipe4
106 | 
107 |     # Source: Phytium
108 |     # "One shared 16-entry floating point and ASIMD queue"
109 |     sched3: 16-entry FP/Vector Scheduler
110 | 
111 |     # Source: Phytium
112 |     pipe5: Pipe \#5 {
113 |       FMAC
114 |       FDIV
115 |     }
116 |     sched3 -> rf -> pipe5
117 | 
118 |     # Source: Phytium
119 |     pipe6: Pipe \#6 {
120 |       FMAC
121 |       FDIV
122 |     }
123 |     sched3 -> rf -> pipe6
124 | 
125 |     # Source: Phytium
126 |     sched4: AGU Scheduler
127 | 
128 |     lsu: LSU {
129 |       # Source: Phytium
130 |       # "One 24-entry load/store queue"
131 |       24-entry Load/Store Queue
132 | 
133 |       # Source: Phytium
134 |       # "4 cycles latency from load to use"
135 |       4 cycle load to use latency
136 |     }
137 | 
138 |     sched4 -> rf -> lsu
139 | 
140 |     rob -> sched1
141 |     rob -> sched2
142 |     rob -> sched3
143 |     rob -> sched4
144 |   }
145 |   frontend.dispatch -> backend.rename
146 | 
147 |   mem: Memory {
148 |     # Source: Phytium
149 |     # "32KB L1 data cache"
150 |     l1dc: 32KB L1DC {
151 |       # Source: Phytium
152 |       6 outstanding loads
153 |     }
154 | 
155 |     l2: L2 Cache
156 |     l1dc -> l2
157 |   }
158 |   frontend.l1ic -> mem.l2
159 |   backend.lsu -> mem.l1dc
160 | 
161 |   info: |md
162 |     Drawn by Jiajie Chen @jiegec
163 | 
164 |     Based on data from Phytium
165 |   |
166 | }


--------------------------------------------------------------------------------
/docs/xiaomi.md:
--------------------------------------------------------------------------------
1 | # Phytium Xiaomi
2 | 
3 | ![](./xiaomi.svg)
4 | 
5 | References:
6 | 
7 | - [Mars: A 64-core ARMv8 Processor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc27/HC27.24-Monday-Epub/HC27.24.30-HP-Cloud-Comm-Epub/HC27.24.321-64core-Zhang-phytium-v1.0.pdf)


--------------------------------------------------------------------------------
/docs/zen1.d2:
--------------------------------------------------------------------------------
  1 | cpu: AMD Zen1 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: AMD
  5 |       # "L0BTB holds 4 forward taken branches and 4 backward taken branches, and
  6 |       # predicts with zero bubbles. L1BTB has 256 entries and creates one bubble
  7 |       # if prediction differs from L0BTB. L2BTB has 4096 entries and creates
  8 |       # four bubbles if its prediction differs from L1BTB."
  9 |       l1btb: 8-entry(4 forward, 4 backward) L1 BTB, zero bubbles
 10 | 
 11 |       # Source: AMD
 12 |       l2btb: 256-entry L2 BTB, one bubble
 13 | 
 14 |       # Source: AMD
 15 |       l3btb: 4096-entry L3 BTB, four bubbles
 16 | 
 17 |       # Source: AMD
 18 |       # "The processor implements a 512-entry indirect target array"
 19 |       indir: 512-entry Indirect Target Array
 20 | 
 21 |       # Source: AMD
 22 |       # "The processor implements a 32-entry return address stack (RAS) to
 23 |       # predict return addresses from a near call."
 24 |       ras: 32-entry Return Address Stack
 25 |     }
 26 | 
 27 |     l1ic: L1 IC {
 28 |       # Source: AMD
 29 |       # "The processor contains a fully-associative L1 instruction TLB (ITLB)
 30 |       # with 64 entries that can hold 4- Kbyte, 2-Mbyte, or 1-Gbyte page
 31 |       # entries."
 32 |       l1itlb: 64-entry L1 ITLB
 33 | 
 34 |       # Source: AMD
 35 |       # "The processor provides an 8-way set associative L2 instruction TLB with
 36 |       # 512 entries capable of holding 4-Kbyte pages, and 2-Mbyte pages. 1-Gbyte
 37 |       # pages are not held in the L2 instruction TLB; they are smashed into
 38 |       # 2-Mbyte pages in the L2 ITLB."
 39 |       l2itlb: 512-entry 8-way L2 ITLB
 40 | 
 41 |       # Source: AMD
 42 |       # "The AMD Family 17h processor contains a 64-Kbyte, 4-way set associative
 43 |       # L1 instruction cache."
 44 |       l1ic: 64KB 4-way L1 IC
 45 |     }
 46 | 
 47 |     fq: Fetch Queue
 48 |     bp -> fq
 49 |     fq -> l1ic
 50 | 
 51 |     # Source: AMD
 52 |     # "The fetch unit sends these bytes to the decode unit through a 20 entry
 53 |     # Instruction Byte Queue (IBQ), each entry holding 16 instruction bytes. In
 54 |     # SMT mode each thread has 10 dedicated IBQ entries"
 55 |     iq: 20x 16B Instruction Byte Queue
 56 |     # Source: AMD
 57 |     # "The AMD Family 17h processor fetches instructions from the instruction
 58 |     # cache in 32-byte naturally aligned blocks. The processor can perform an
 59 |     # instruction block fetch every cycle."
 60 |     l1ic -> iq: 32B/cycle
 61 | 
 62 |     # Source: AMD
 63 |     # "The decode unit scans two of these windows in a given cycle, decoding a
 64 |     # maximum of four instructions."
 65 |     decode: 4-way Decode
 66 |     iq -> decode: 2 IBQ entry
 67 | 
 68 |     # Source: AMD
 69 |     # "The op cache is organized as an associative cache with 32 sets and 8
 70 |     # ways. At each set-way intersection is an entry containing up to 8
 71 |     # instructions, so the maximum capacity of the op cache is then 2K
 72 |     # instructions."
 73 |     uopc: 256-entry 8-way, 8 inst/entry UOP Cache
 74 |     decode -> uopc
 75 |     bp -> uopc
 76 | 
 77 |     # Source: AMD
 78 |     # "the maximum throughput from the op cache is 8 instructions per cycle
 79 |     # whereas the maximum throughput from the traditional fetch and decode
 80 |     # pipeline is 4 instructions per cycle."
 81 |     uop: UOP Queue
 82 |     uopc -> uop: 8 inst/cycle
 83 |     decode -> uop: 4 inst/cycle
 84 | 
 85 |     rename: Rename {
 86 |       Move Elimination
 87 |       Zero Idiom
 88 |     }
 89 |     uop -> rename
 90 |   }
 91 | 
 92 |   backend: Backend {
 93 |     # Source: AMD
 94 |     # "The unit can receive up to 6 macro ops dispatched per cycle and track up
 95 |     # to 192 macro ops in-flight. "
 96 |     # "The retire unit handles in-order commit of up to eight macro ops per
 97 |     # cycle."
 98 |     rob: 192-entry ROB, retire 8 op/cycle
 99 | 
100 |     rf: Register File {
101 |       # Source: AMD
102 |       # "The integer physical register file (PRF) consists of 168 registers,
103 |       # with up to 38 per thread mapped to architectural state or
104 |       # microarchitectural temporary state."
105 |       irf: 168-entry Integer Register File
106 | 
107 |       # Source: AMD
108 |       # 160 entry Physical Register File in Figure 6
109 |       vrf: 160-entry FP/Vector Register File
110 |     }
111 | 
112 |     # Source: AMD
113 |     # "ALU micro ops are sent to one of four 14-entry ALU schedulers"
114 |     sched1: 14-entry ALU Scheduler \#1
115 | 
116 |     pipe1: Pipe \#1 {
117 |       ALU
118 |     }
119 |     sched1 -> rf -> pipe1
120 | 
121 |     # Source: AMD
122 |     sched2: 14-entry ALU Scheduler \#2
123 | 
124 |     pipe2: Pipe \#2 {
125 |       ALU
126 |     }
127 |     sched2 -> rf -> pipe2
128 | 
129 |     # Source: AMD
130 |     sched3: 14-entry ALU Scheduler \#3
131 | 
132 |     pipe3: Pipe \#3 {
133 |       ALU
134 |     }
135 |     sched3 -> rf -> pipe3
136 | 
137 |     # Source: AMD
138 |     sched4: 14-entry ALU Scheduler \#4
139 | 
140 |     # Source: AMD
141 |     pipe4: Pipe \#4 {
142 |       ALU
143 |     }
144 |     sched4 -> rf -> pipe4
145 | 
146 |     # Source: AMD
147 |     # "Load and Store micro ops are sent to one of two 14-entry address
148 |     # generation units (AGUs) Each scheduler can issue one micro op per cycle."
149 |     sched5: 14-entry AGU Scheduler \#5
150 | 
151 |     # Source: AMD
152 |     pipe5: Pipe \#5 {
153 |       Load AGU
154 |       Store AGU
155 |     }
156 |     sched5 -> rf -> pipe5
157 | 
158 |     sched6: 14-entry AGU Scheduler \#6
159 | 
160 |     pipe6: Pipe \#6 {
161 |       Load AGU
162 |       Store AGU
163 |     }
164 |     sched6 -> rf -> pipe6
165 | 
166 |     lsu: LSU {
167 |       # Source: AMD
168 |       # "The LS unit includes a 44-entry load queue (LDQ)."
169 |       44-entry Load Queue
170 |       # Source: AMD
171 |       # "The LS unit utilizes a 44-entry store queue "
172 |       44-entry Store Queue
173 | 
174 |       # Source: AMD
175 |       # "The LS unit contains three largely independent pipelines enabling the
176 |       # execution of two 128-bit load memory operations and one 128-bit store
177 |       # memory operation per cycle."
178 |       2x 128b Load Pipe
179 |       1x 128b Store Pipe
180 | 
181 |       # Source: jiegec, AMD
182 |       # AMD: "4-cycle load-to-use integer load latency and 7-cycle load-to-use FP load latency"
183 |       4 cycle load to use latency
184 |       7 cycle load to FP use latency
185 |     }
186 | 
187 |     pipe5 -> lsu
188 |     pipe6 -> lsu
189 | 
190 |     rob -> sched1
191 |     rob -> sched2
192 |     rob -> sched3
193 |     rob -> sched4
194 |     rob -> sched5
195 |     rob -> sched6
196 | 
197 |     # Source: AMD
198 |     # "The floating-point scheduler has a 36 entry micro-op capacity"
199 |     sched7: 36-entry Vector/FP Scheduler \#7
200 |     rob -> sched7
201 | 
202 |     # Source: AMD
203 |     pipe7: Pipe \#7 {
204 |       FMUL
205 |       FMISC
206 |       VADD
207 |       VMUL
208 |       VMISC
209 |       AES
210 |     }
211 |     sched7 -> rf -> pipe7
212 | 
213 |     # Source: AMD
214 |     pipe8: Pipe \#8 {
215 |       FMUL
216 |       FMISC
217 |       VADD
218 |       VSHUF
219 |       VMISC
220 |       AES
221 |       CLM
222 |     }
223 |     sched7 -> rf -> pipe8
224 | 
225 |     # Source: AMD
226 |     pipe9: Pipe \#9 {
227 |       FADD
228 |       FMISC
229 |       STORE
230 |       VSHUF
231 |       VSHIFT
232 |       VMISC
233 |     }
234 |     sched7 -> rf -> pipe9 -> lsu
235 | 
236 |     # Source: AMD
237 |     pipe10: Pipe \#10 {
238 |       FADD
239 |       FCVT
240 |       FDIV
241 |       FMISC
242 |       VADD
243 |       VMISC
244 |     }
245 |     sched7 -> rf -> pipe10
246 |   }
247 |   frontend.rename -> backend.rob
248 | 
249 |   mem: Memory {
250 |     l1: L1 DC {
251 |       # Source: AMD
252 |       # "The fully-associative L1 data TLB (DTLB) provides 64 entries that hold
253 |       # 4-Kbyte, 2-Mbyte, or 1- Gbyte page entries."
254 |       l1dtlb: 64-entry L1 DTLB
255 | 
256 |       # Source: AMD
257 |       # "The L2 data TLB provides a unified 12-way set-associative L2 data TLB
258 |       # with 1536 entries"
259 |       l2dtlb: 1536-entry 12-way L2 DTLB
260 | 
261 |       # Source: AMD
262 |       # "The AMD Family 17h processor contains a 32-Kbyte, 8-way set associative
263 |       # L1 data cache"
264 |       l1dc: 32KB 8-way L1DC
265 | 
266 |       # Source: AMD
267 |       # "A hardware table walker loads page table information into the TLBs."
268 |       ptw: 1 Page Table Walker
269 |     }
270 | 
271 |     l2: L2 {
272 |       # Source: AMD
273 |       # "The AMD Family 17h processor implements a unified 8-way set associative
274 |       # write-back L2 cache per core. This on-die L2 cache is inclusive of the
275 |       # L1 caches in the core. The L2 cache size is 512 Kbytes with a variable
276 |       # load-to-use latency of no less than 12 cycles."
277 |       l2dc: 512KB 8-way L2 Cache
278 |     }
279 | 
280 |     # Source: AMD
281 |     # "The L2 to L1 data path is 32 bytes wide."
282 |     l1 -> l2: 32B/cycle
283 | 
284 |     l3: L3 {
285 |       # Source: AMD
286 |       # "The AMD Family 17h processor implements a 4 MB or 8-MB L3 cache
287 |       # (depending on SOC configuration) that is 16-way set associative and
288 |       # shared by four cores inside a CPU complex."
289 |       l3dc: 4MB/8MB 16-way L3 Cache
290 |     }
291 |     l2 -> l3
292 |   }
293 |   frontend.l1ic -> mem.l2
294 |   backend.lsu -> mem.l1
295 | 
296 |   info: |md
297 |     Drawn by Jiajie Chen @jiegec
298 | 
299 |     Based on data from AMD
300 |   |
301 | }
302 | 


--------------------------------------------------------------------------------
/docs/zen1.md:
--------------------------------------------------------------------------------
1 | # AMD Zen1
2 | 
3 | ![](./zen1.svg)
4 | 
5 | References:
6 | 
7 | - Software Optimization Guide for AMD Family 17h Processors
8 | - [The AMD Zen and Ryzen 7 Review: A Deep Dive on 1800X, 1700X and 1700](https://www.anandtech.com/show/11170/the-amd-zen-and-ryzen-7-review-a-deep-dive-on-1800x-1700x-and-1700/4)
9 | 


--------------------------------------------------------------------------------
/docs/zen2.md:
--------------------------------------------------------------------------------
 1 | # AMD Zen2
 2 | 
 3 | ![](./zen2.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [Deep Diving Neoverse N1](https://chipsandcheese.com/2021/10/22/deep-diving-neoverse-n1/)
 8 | - [AMD Zen 2 Microarchitecture Analysis: Ryzen 3000 and EPYC Rome](https://www.anandtech.com/show/14525/amd-zen-2-microarchitecture-analysis-ryzen-3000-and-epyc-rome/6)
 9 | - [AMD Zen 3 Ryzen Deep Dive Review: 5950X, 5900X, 5800X and 5600X Tested](https://www.anandtech.com/show/16214/amd-zen-3-ryzen-deep-dive-review-5950x-5900x-5800x-and-5700x-tested/4)
10 | - [X86SchedulerZnver2.td in LLVM](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/X86/X86ScheduleZnver2.td)
11 | - Software Optimization Guide for AMD Family 17h Models 30h and Greater Processors
12 | - [Zen 2 Dieshot](https://www.bilibili.com/video/BV1Af421i7jY/)
13 | 


--------------------------------------------------------------------------------
/docs/zen3.d2:
--------------------------------------------------------------------------------
  1 | cpu : AMD Zen3 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese, AMD
  5 |       # "L1BTB has 1024 entries and predicts with zero bubbles for conditional
  6 |       # and unconditional direct branches, and one cycle for calls, returns and
  7 |       # indirect branches. L2BTB has 6656 entries and creates three bubbles if
  8 |       # its prediction differs from L1BTB."
  9 |       l1btb: 1024-entry L1 BTB, zero bubbles
 10 | 
 11 |       # Source: Chips and Cheese, AMD
 12 |       l2btb: 6656-entry L2 BTB, three bubbles
 13 | 
 14 |       # Source: Chips and Cheese, AMD
 15 |       # "The processor implements a 1536-entry indirect target array"
 16 |       indir: 1536-entry Indirect Target Array
 17 | 
 18 |       # Source: Chips and Cheese, AMD
 19 |       # "The processor implements a 32-entry return address stack (RAS) per thread"
 20 |       ras: 32-entry Return Address Stack
 21 | 
 22 |       # Source: AMD
 23 |       penalty: 11-18 cycle branch misprediction latency, 13 typical
 24 |     }
 25 | 
 26 |     l1ic: L1 IC {
 27 |       # Source: Chips and Cheese, AMD
 28 |       l1itlb: 64-entry L1 ITLB
 29 | 
 30 |       # Source: Chips and Cheese, AMD
 31 |       l2itlb: 512-entry L2 ITLB
 32 | 
 33 |       # Source: Chips and Cheese, AMD
 34 |       l1ic: 32KB 8-way L1 IC
 35 |     }
 36 | 
 37 |     # Source: Chips and Cheese
 38 |     fq: 64-entry Fetch Queue
 39 |     bp -> fq
 40 |     fq -> l1ic
 41 | 
 42 |     # Source: AMD
 43 |     # "The fetch unit sends these bytes to the decode unit through a 24 entry
 44 |     # Instruction Byte Queue (IBQ), each entry holding 16 instruction bytes. In
 45 |     # SMT mode each thread has 12 dedicated IBQ entries."
 46 |     iq: 24x 16B Instruction Byte Queue
 47 | 
 48 |     # Source: AMD
 49 |     # "The processor fetches instructions from the instruction cache in 32-byte
 50 |     # blocks that are 16-byte aligned and contained within a 64-byte aligned
 51 |     # block. The processor can perform such a 32-byte fetch every cycle."
 52 |     l1ic -> iq: 32B/cycle
 53 | 
 54 |     # Source: Chips and Cheese, AMD
 55 |     # AMD: "The decode unit scans two of these IBQ entries in a given cycle,
 56 |     # decoding a maximum of four instructions."
 57 |     decode: 4-way Decode
 58 |     iq -> decode: 2 IBQ entry
 59 | 
 60 |     # Source: Chips and Cheese, AMD
 61 |     # "The op cache is organized as an associative cache with 64 sets and 8
 62 |     # ways. At each set-way intersection is an entry containing up to 8 macro
 63 |     # ops."
 64 |     uopc: 512-entry 8-way, 8 mop/entry UOP Cache
 65 |     decode -> uopc
 66 |     bp -> uopc
 67 | 
 68 |     # Source: Chips and Cheese
 69 |     uop: UOP Queue
 70 |     uopc -> uop: 8 macro ops/cycle
 71 |     decode -> uop: 4 instructions/cycle
 72 | 
 73 |     # Source: Chips and Cheese
 74 |     rename: 6-way Rename {
 75 |       Move Elimination
 76 |       Zero Idiom
 77 |     }
 78 |     uop -> rename
 79 |   }
 80 | 
 81 |   backend: Backend {
 82 |     # Source: Chips and Cheese, AMD
 83 |     # "The unit can receive up to 6 macro ops dispatched per cycle and track up
 84 |     # to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode."
 85 |     rob: 256-entry ROB
 86 | 
 87 |     # Source: Chips and Cheese
 88 |     bob: 48-taken-entry 117-not-taken-entry Branch Order Buffer
 89 | 
 90 |     rf: Register File {
 91 |       # Source: Chips and Cheese, AMD
 92 |       # "The integer physical register file (PRF) consists of 192 registers,
 93 |       # with up to 38 per thread mapped to architectural state or
 94 |       # micro-architectural temporary state."
 95 |       irf: 192-entry Integer Register File
 96 | 
 97 |       # Source: Chips and Cheese
 98 |       flagsrf: 122-entry Flags Register File
 99 | 
100 |       # Source: Chips and Cheese, AMD
101 |       vrf: 160-entry 256b Vector Register File
102 |     }
103 | 
104 |     # Source: Chips and Cheese, AMD
105 |     # AMD reports 96 integer scheduler entries in total (4 x 24-entry schedulers)
106 |     sched1: 24-entry Scheduler \#1
107 | 
108 |     # Source: Chips and Cheese
109 |     pipe1: Pipe \#1 {
110 |       ALU
111 |       CMOV
112 |     }
113 |     sched1 -> rf -> pipe1
114 | 
115 |     # Source: Chips and Cheese
116 |     pipe2: Pipe \#2 {
117 |       Branch
118 |     }
119 |     sched1 -> rf -> pipe2
120 | 
121 |     # Source: Chips and Cheese, AMD
122 |     sched2: 24-entry Scheduler \#2
123 | 
124 |     # Source: Chips and Cheese
125 |     pipe3: Pipe \#3 {
126 |       ALU
127 |     }
128 |     sched2 -> rf -> pipe3
129 | 
130 |     # Source: Chips and Cheese
131 |     pipe4: Pipe \#4 {
132 |       AGU
133 |     }
134 |     sched2 -> rf -> pipe4
135 | 
136 |     # Source: Chips and Cheese, AMD
137 |     sched3: 24-entry Scheduler \#3
138 | 
139 |     # Source: Chips and Cheese
140 |     pipe5: Pipe \#5 {
141 |       ALU
142 |       INT MUL
143 |       PDEP
144 |       CRC
145 |     }
146 |     sched3 -> rf -> pipe5
147 | 
148 |     # Source: Chips and Cheese
149 |     pipe6: Pipe \#6 {
150 |       AGU
151 |     }
152 |     sched3 -> rf -> pipe6
153 | 
154 |     # Source: Chips and Cheese, AMD
155 |     sched4: 24-entry Scheduler \#4
156 | 
157 |     # Source: Chips and Cheese
158 |     pipe7: Pipe \#7 {
159 |       ALU
160 |       INT DIV
161 |       CMOV
162 |       Branch
163 |     }
164 |     sched4 -> rf -> pipe7
165 | 
166 |     # Source: Chips and Cheese
167 |     pipe8: Pipe \#8 {
168 |       AGU
169 |     }
170 |     sched4 -> rf -> pipe8
171 | 
172 |     lsu: LSU {
173 |       # Source: Chips and Cheese, AMD
174 |       # 116-entry Load Queue reported by Chips and Cheese
175 |       # 72-entry Load Queue reported by AMD
176 |       # "The LS unit can process up to 72 out-of-order loads."
177 |       # "The LS unit utilizes a 64-entry store queue (STQ)."
178 |       72-entry Load Queue
179 |       64-entry Store Queue
180 | 
181 |       # Source: AMD
182 |       # 3 loads per cycle (max 2 if 128b/256b)
183 |       # 2 stores per cycle (max 1 if 128b/256b)
184 |       # Max 3 total memory ops
185 |       # "The LS unit contains three largely independent pipelines enabling the
186 |       # execution of three 256-bit memory operations per cycle. All three memory
187 |       # operations can be loads, with a separate maximum of two 128- or
188 |       # 256-bit loads. A maximum of two of the memory operations can be stores,
189 |       # with a maximum of one store if the store is a 128- or 256-bit store."
190 |       3 Load per cycle (max 2 if 128b/256b)
191 |       2 Store per cycle (max 1 if 128b/256b)
192 | 
193 |       # Source: jiegec
194 |       # Source: AMD
195 |       # "4-cycle load-to-use integer load latency and 7-cycle load-to-use FP load latency."
196 |       4 cycle load to use latency
197 |       7 cycle load to FP use latency
198 |     }
199 | 
200 |     pipe4 -> lsu
201 |     pipe6 -> lsu
202 |     pipe8 -> lsu
203 | 
204 |     rob -> sched1
205 |     rob -> sched2
206 |     rob -> sched3
207 |     rob -> sched4
208 | 
209 |     # Source: Chips and Cheese, AMD
210 |     nsq: 64-entry Non/Pre-Scheduling Queue
211 |     rob -> nsq
212 | 
213 |     # Source: Chips and Cheese, AMD
214 |     sched5: 32-entry Scheduler \#5
215 |     nsq -> sched5
216 | 
217 |     # Source: Chips and Cheese
218 |     pipe9: Pipe \#9 {
219 |       FMA
220 |       FMisc
221 |       INT Vec ALU
222 |       INT Vec MUL
223 |       AES
224 |     }
225 |     sched5 -> rf -> pipe9
226 | 
227 |     # Source: Chips and Cheese
228 |     pipe10: Pipe \#10 {
229 |       FADD
230 |       FMisc
231 |       INT Vec ALU
232 |       Shuffle
233 |       Convert
234 |     }
235 |     sched5 -> rf -> pipe10
236 | 
237 |     # Source: Chips and Cheese
238 |     pipe11: Pipe \#11 {
239 |       FStore
240 |       F2I
241 |     }
242 |     sched5 -> rf -> pipe11 -> lsu
243 | 
244 |     # Source: Chips and Cheese, AMD
245 |     sched6: 32-entry Scheduler \#6
246 |     nsq -> sched6
247 | 
248 |     # Source: Chips and Cheese
249 |     pipe12: Pipe \#12 {
250 |       FMA
251 |       FMisc
252 |       INT Vec ALU
253 |       Shuffle
254 |       AES
255 |       Vec Shift
256 |       FDIV
257 |     }
258 |     sched6 -> rf -> pipe12
259 | 
260 |     # Source: Chips and Cheese
261 |     pipe13: Pipe \#13 {
262 |       FADD
263 |       FMisc
264 |       INT Vec ALU
265 |       INT Vec MUL
266 |     }
267 |     sched6 -> rf -> pipe13
268 | 
269 |     # Source: Chips and Cheese, AMD
270 |     pipe14: Pipe \#14 {
271 |       # FStore + F2I reported by Chips and Cheese
272 |       # FStore reported by AMD
273 |       FStore
274 |     }
275 |     sched6 -> rf -> pipe14 -> lsu
276 |   }
277 |   frontend.rename -> backend.rob
278 |   frontend.rename -> backend.bob
279 | 
280 |   mem: Memory {
281 |     l1: L1 DC {
282 |       # Source: Chips and Cheese, AMD
283 |       l1dtlb: 64-entry L1 DTLB
284 | 
285 |       # Source: Chips and Cheese, AMD
286 |       l2dtlb: 2048-entry L2 DTLB
287 | 
288 |       # Source: Chips and Cheese, AMD
289 |       l1dc: 32KB 8-way L1DC
290 | 
291 |       # Source: AMD
292 |       ptw: 6 Page Table Walkers
293 |     }
294 | 
295 |     l2: L2 {
296 |       # Source: Chips and Cheese, AMD
297 |       l2dc: 512KB 8-way L2 Cache, 12 cycle latency
298 |     }
299 | 
300 |     # Source: Chips and Cheese, AMD
301 |     l1 -> l2: 32B/cycle
302 | 
303 |     l3: L3 {
304 |       # Source: Chips and Cheese, AMD
305 |       l3dc: 32MB (4MB/core) 16-way L3 Cache, 46 cycle latency
306 |     }
307 |     l2 -> l3
308 |   }
309 |   frontend.l1ic -> mem.l2
310 |   backend.lsu -> mem.l1
311 | 
312 |   info: |md
313 |     Drawn by Jiajie Chen @jiegec
314 | 
315 |     Based on data from Chips and Cheese, AMD
316 |   |
317 | }


--------------------------------------------------------------------------------
/docs/zen3.md:
--------------------------------------------------------------------------------
 1 | # AMD Zen3
 2 | 
 3 | ![](./zen3.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [AMD Next Generation "Zen 4" Core and 4th Gen AMD EPYC(TM) 9004 Server CPU](https://hc2023.hotchips.org/assets/program/conference/day1/CPU1/HC_Zen4_Epyc_Final_20230825%20-%20Embargoed%20until%20Aug%2029%202023.pdf)
 8 | - [AMD Next Generation "Zen 3" Core](https://hc33.hotchips.org/assets/program/conference/day1/HC2021.C1.2%20AMD%20Mark%20Evers.pdf)
 9 | - [AMD’s Zen 4 Part 1: Frontend and Execution Engine](https://chipsandcheese.com/2022/11/05/amds-zen-4-part-1-frontend-and-execution-engine/)
10 | - [AMD Zen 3 Ryzen Deep Dive Review: 5950X, 5900X, 5800X and 5600X Tested](https://www.anandtech.com/show/16214/amd-zen-3-ryzen-deep-dive-review-5950x-5900x-5800x-and-5700x-tested/4)
11 | 
12 | 


--------------------------------------------------------------------------------
/docs/zen4.d2:
--------------------------------------------------------------------------------
  1 | cpu : AMD Zen4 CPU {
  2 |   frontend: Frontend {
  3 |     bp: Branch Predictor {
  4 |       # Source: Chips and Cheese, AMD
  5 |       # "The L1 BTB has 1536 entries and predicts with zero prediction bubbles
  6 |       # for conditional and unconditional direct branches, and one cycle bubble
  7 |       # for calls, returns and indirect branches. The L2 BTB has 7680 entries
  8 |       # and creates three prediction bubbles if its prediction differs from that
  9 |       # of the L1 BTB."
 10 |       l1btb: 1536-entry L1 BTB, zero bubble
 11 | 
 12 |       # Source: Chips and Cheese, AMD
 13 |       l2btb: 7168/7680-entry L2 BTB, three bubble
 14 | 
 15 |       # Source: Chips and Cheese, AMD
 16 |       # "The processor implements a 3072-entry indirect target array"
 17 |       indir: 3072-entry Indirect Target Array
 18 | 
 19 |       # Source: Chips and Cheese, AMD
 20 |       # "The processor implements a 32-entry return address stack (RAS) per thread"
 21 |       ras: 32-entry Return Address Stack
 22 | 
 23 |       # Source: AMD
 24 |       2 taken predictions per cycle
 25 |     }
 26 | 
 27 |     l1ic: L1 IC {
 28 |       # Source: Chips and Cheese, AMD
 29 |       l1itlb: 64-entry L1 ITLB
 30 | 
 31 |       # Source: Chips and Cheese, AMD
 32 |       l2itlb: 512-entry L2 ITLB
 33 | 
 34 |       # Source: Chips and Cheese, AMD
 35 |       l1ic: 32KB 8-way L1 IC
 36 |     }
 37 | 
 38 |     fq: Fetch Queue
 39 |     bp -> fq
 40 |     fq -> l1ic
 41 | 
 42 |     # Source: AMD
 43 |     # "The fetch unit sends these bytes to the decode unit through a 24 entry
 44 |     # Instruction Byte Queue (IBQ), each entry holding 16 instruction bytes. In
 45 |     # SMT mode each thread has 12 dedicated IBQ entries."
 46 |     iq: 24x 16B Instruction Byte Queue
 47 | 
 48 |     # Source: AMD
 49 |     # "The processor fetches instructions from the instruction cache in 32-byte
 50 |     # blocks that are 16-byte aligned and contained within a 64-byte aligned
 51 |     # block. The processor can perform a 32-byte fetch every cycle."
 52 |     l1ic -> iq: 32B/cycle
 53 | 
 54 |     # Source: Chips and Cheese, AMD
 55 |     # AMD: "The decode unit scans two of these IBQ entries in a given cycle,
 56 |     # decoding a maximum of four instructions."
 57 |     decode: 4-way Decode
 58 |     iq -> decode: 2x IBQ entry
 59 | 
 60 |     # Source: Chips and Cheese, AMD
 61 |     # "The Op Cache is organized as an associative cache with 64 sets and 12
 62 |     # ways. At each set-way intersection is an entry containing up to 9 macro
 63 |     # ops."
 64 |     uopc: 768-entry 12-way, 9 mops/entry UOP Cache
 65 |     decode -> uopc
 66 |     bp -> uopc
 67 | 
 68 |     # Source: Chips and Cheese
 69 |     uop: UOP Queue
 70 |     uopc -> uop
 71 |     decode -> uop
 72 | 
 73 |     # Source: Chips and Cheese
 74 |     rename: 6-way Rename {
 75 |       Move Elimination
 76 |       Zero Idiom
 77 |     }
 78 |     uop -> rename
 79 |   }
 80 | 
 81 |   backend: Backend {
 82 |     # Source: Chips and Cheese, AMD
 83 |     # "The unit can receive up to 6 macro ops dispatched per cycle and track up
 84 |     # to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode"
 85 |     rob: 320-entry ROB
 86 | 
 87 |     # Source: Chips and Cheese
 88 |     bob: 62-taken-entry 118-not-taken-entry Branch Order Buffer
 89 | 
 90 |     rf: Register File {
 91 |       # Source: Chips and Cheese, AMD
 92 |       # "The integer physical register file (PRF) consists of 224 registers,
 93 |       # with up to 38 per thread mapped to architectural state or
 94 |       # micro-architectural temporary state."
 95 |       irf: 224-entry Integer Register File
 96 | 
 97 |       # Source: Chips and Cheese, AMD
 98 |       # 238 reported by Chips and Cheese
 99 |       # 126 reported by AMD
100 |       flagsrf: 126-entry Flags Register File
101 | 
102 |       # Source: Chips and Cheese, AMD
103 |       vmaskrf: (52+16)-entry Vector Mask Register File
104 | 
105 |       # Source: Chips and Cheese, AMD
106 |       vrf: 192-entry 512b Vector Register File
107 |     }
108 | 
109 |     # Source: Chips and Cheese, AMD
110 |     sched1: 24-entry Scheduler \#1
111 | 
112 |     # Source: Chips and Cheese
113 |     pipe1: Pipe \#1 {
114 |       ALU
115 |     }
116 |     sched1 -> rf -> pipe1
117 | 
118 |     # Source: Chips and Cheese
119 |     pipe2: Pipe \#2 {
120 |       Branch
121 |     }
122 |     sched1 -> rf -> pipe2
123 | 
124 |     # Source: Chips and Cheese, AMD
125 |     sched2: 24-entry Scheduler \#2
126 | 
127 |     # Source: Chips and Cheese
128 |     pipe3: Pipe \#3 {
129 |       ALU
130 |     }
131 |     sched2 -> rf -> pipe3
132 | 
133 |     # Source: Chips and Cheese
134 |     pipe4: Pipe \#4 {
135 |       AGU
136 |     }
137 |     sched2 -> rf -> pipe4
138 | 
139 |     # Source: Chips and Cheese, AMD
140 |     sched3: 24-entry Scheduler \#3
141 | 
142 |     # Source: Chips and Cheese
143 |     pipe5: Pipe \#5 {
144 |       ALU
145 |     }
146 |     sched3 -> rf -> pipe5
147 | 
148 |     # Source: Chips and Cheese
149 |     pipe6: Pipe \#6 {
150 |       AGU
151 |     }
152 |     sched3 -> rf -> pipe6
153 | 
154 |     # Source: Chips and Cheese, AMD
155 |     sched4: 24-entry Scheduler \#4
156 | 
157 |     # Source: Chips and Cheese
158 |     pipe7: Pipe \#7 {
159 |       ALU
160 |       Branch
161 |     }
162 |     sched4 -> rf -> pipe7
163 | 
164 |     # Source: Chips and Cheese
165 |     pipe8: Pipe \#8 {
166 |       AGU
167 |     }
168 |     sched4 -> rf -> pipe8
169 | 
170 |     lsu: LSU {
171 |       # Source: Chips and Cheese, AMD
172 |       # 136 (Load Validation Queue) & 88 (Load Execution Queue) reported by Chips and Cheese
173 |       # 88 reported by AMD
174 |       # "The LS can track up to 48 uncompleted loads and up to 88 completed
175 |       # loads."
176 |       136-entry Load Validation Queue
177 |       88-entry Load (Execution) Queue
178 |       # "The LS unit utilizes a 64-entry store queue (STQ) which holds stores
179 |       # from dispatch until the store data can be written to the data cache."
180 |       64-entry Store Queue
181 | 
182 |       # Source: AMD
183 |       # 3 loads per cycle (max 2 if 256b)
184 |       # 2 stores per cycle (max 1 if 256b)
185 |       3 Load per cycle (2 if 256b)
186 |       2 Store per cycle (1 if 256b)
187 | 
188 |       # Source: AMD
189 |       # "4-cycle load-to-use integer load latency and 7-cycle load-to-use FP load latency."
190 |       4 cycle load to use latency
191 |       7 cycle load to FP use latency
192 |     }
193 | 
194 |     pipe4 -> lsu
195 |     pipe6 -> lsu
196 |     pipe8 -> lsu
197 | 
198 |     rob -> sched1
199 |     rob -> sched2
200 |     rob -> sched3
201 |     rob -> sched4
202 | 
203 |     # Source: Chips and Cheese, AMD
204 |     nsq: 64-entry Non/Pre-Scheduling Queue
205 |     rob -> nsq
206 | 
207 |     # Source: Chips and Cheese, AMD
208 |     sched5: 32-entry Scheduler \#5
209 |     nsq -> sched5
210 | 
211 |     # Source: Chips and Cheese
212 |     pipe9: Pipe \#9 {
213 |       FMA
214 |     }
215 |     sched5 -> rf -> pipe9
216 | 
217 |     # Source: Chips and Cheese
218 |     pipe10: Pipe \#10 {
219 |       FADD
220 |     }
221 |     sched5 -> rf -> pipe10
222 | 
223 |     # Source: Chips and Cheese
224 |     pipe11: Pipe \#11 {
225 |       FStore
226 |       F2I
227 |     }
228 |     sched5 -> rf -> pipe11 -> lsu
229 | 
230 |     # Source: Chips and Cheese, AMD
231 |     sched6: 32-entry Scheduler \#6
232 |     nsq -> sched6
233 | 
234 |     # Source: Chips and Cheese
235 |     pipe12: Pipe \#12 {
236 |       FMA
237 |     }
238 |     sched6 -> rf -> pipe12
239 | 
240 |     # Source: Chips and Cheese
241 |     pipe13: Pipe \#13 {
242 |       FADD
243 |     }
244 |     sched6 -> rf -> pipe13
245 | 
246 |     # Source: Chips and Cheese, AMD
247 |     pipe14: Pipe \#14 {
248 |       # FStore + F2I reported by Chips and Cheese
249 |       # FStore reported by AMD
250 |       FStore
251 |     }
252 |     sched6 -> rf -> pipe14 -> lsu
253 |   }
254 |   frontend.rename -> backend.rob
255 |   frontend.rename -> backend.bob
256 | 
257 |   mem: Memory {
258 |     l1: L1 DC {
259 |       # Source: Chips and Cheese, AMD
260 |       l1dtlb: 72-entry L1 DTLB
261 | 
262 |       # Source: Chips and Cheese, AMD
263 |       l2dtlb: 3072-entry L2 DTLB
264 | 
265 |       # Source: Chips and Cheese, AMD
266 |       l1dc: 32KB 8-way L1DC
267 | 
268 |       # Source: AMD
269 |       # "The AMD Zen4 microarchitecture has six hardware page table walkers to
270 |       # handle L2 TLB misses."
271 |       ptw: 6 Page Table Walkers
272 |     }
273 | 
274 |     l2: L2 {
275 |       # Source: Chips and Cheese, AMD
276 |       l2dc: 1MB 8-way L2 Cache, 14 cycle latency
277 |     }
278 | 
279 |     # Source: Chips and Cheese, AMD
280 |     l1 -> l2: 32B/cycle
281 | 
282 |     l3: L3 {
283 |       # Source: Chips and Cheese, AMD
284 |       l3dc: 32MB (4MB/core) 16-way L3 Cache, 50 cycle latency
285 |     }
286 |     l2 -> l3
287 |   }
288 |   frontend.l1ic -> mem.l2
289 |   backend.lsu -> mem.l1
290 | 
291 |   info: |md
292 |     Drawn by Jiajie Chen @jiegec
293 | 
294 |     Based on data from Chips and Cheese, Anandtech and AMD
295 |   |
296 | }


--------------------------------------------------------------------------------
/docs/zen4.md:
--------------------------------------------------------------------------------
 1 | # AMD Zen4
 2 | 
 3 | ![](./zen4.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [AMD Zen 4 Ryzen 9 7950X and Ryzen 5 7600X Review: Retaking The High-End](https://www.anandtech.com/show/17585/amd-zen-4-ryzen-9-7950x-and-ryzen-5-7600x-review-retaking-the-high-end/8)
 8 | - [AMD’s Zen 4 Part 1: Frontend and Execution Engine](https://chipsandcheese.com/2022/11/05/amds-zen-4-part-1-frontend-and-execution-engine/)
 9 | - [AMD’s Zen 4, Part 2: Memory Subsystem and Conclusion](https://chipsandcheese.com/2022/11/08/amds-zen-4-part-2-memory-subsystem-and-conclusion/)
10 | - [Inside the AMD Zen 4 CPU architecture](https://www.custompc.com/inside-amd-zen-4-ryzen-cpu-architecture)
11 | - [AMD Reveals More Zen 5 CPU Core Details](https://www.phoronix.com/review/amd-zen-5-core)
12 | - [AMD Next Generation "Zen 4" Core and 4th Gen AMD EPYC(TM) 9004 Server CPU](https://hc2023.hotchips.org/assets/program/conference/day1/CPU1/HC_Zen4_Epyc_Final_20230825%20-%20Embargoed%20until%20Aug%2029%202023.pdf)
13 | - [AMD Next-Generation “Zen 4” Core and 4th Gen AMD EPYC Server CPUs](https://ieeexplore.ieee.org/document/10466769)
14 | 
15 | 


--------------------------------------------------------------------------------
/docs/zen5.md:
--------------------------------------------------------------------------------
 1 | # AMD Zen5
 2 | 
 3 | ![](./zen5.svg)
 4 | 
 5 | References:
 6 | 
 7 | - [AMD Reveals More Zen 5 CPU Core Details](https://www.phoronix.com/review/amd-zen-5-core)
 8 | - [Zen 5’s 2-Ahead Branch Predictor Unit: How a 30 Year Old Idea Allows for New Tricks](https://chipsandcheese.com/2024/07/26/zen-5s-2-ahead-branch-predictor-unit-how-30-year-old-idea-allows-for-new-tricks/)
 9 | - [Zen 5’s Leaked Slides](https://chipsandcheese.com/2023/10/08/zen-5s-leaked-slides/)
10 | - [AMD’s Strix Point: Zen 5 Hits Mobile](https://chipsandcheese.com/2024/08/10/amds-strix-point-zen-5-hits-mobile/)
11 | - [AMD’s Ryzen 9950X: Zen 5 on Desktop](https://chipsandcheese.com/2024/08/14/amds-ryzen-9950x-zen-5-on-desktop/)
12 | - [Zen 5 Dieshot](https://www.bilibili.com/opus/965843745820901377)
13 | - [Strix Point Dieshot](https://www.bilibili.com/opus/959217298443337751)
14 | - [Software Optimization Guide for the AMD Zen5 Microarchitecture](https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/software-optimization-guides/58455.zip)
15 | - [Discussing AMD’s Zen 5 at Hot Chips 2024](https://chipsandcheese.com/2024/09/15/discussing-amds-zen-5-at-hot-chips-2024/)
16 | - [AMD EPYC 9965 "Turin Dense" Delivers Better Performance/Power Efficiency vs. AmpereOne 192-Core ARM CPU](https://www.phoronix.com/review/amd-epyc-9965-ampereone)
17 | - [AMD EPYC 9755 / 9575F / 9965 Benchmarks Show Dominating Performance](https://www.phoronix.com/review/amd-epyc-9965-9755-benchmarks)
18 | - [5TH GEN AMD EPYC™ PROCESSOR ARCHITECTURE](https://www.amd.com/content/dam/amd/en/documents/epyc-business-docs/white-papers/5th-gen-amd-epyc-processor-architecture-white-paper.pdf)
19 | - [The AMD Zen 5 Microarchitecture: Powering Ryzen AI 300 Series For Mobile and Ryzen 9000 for Desktop](https://www.anandtech.com/show/21469/amd-details-ryzen-ai-300-series-for-mobile-strix-point-with-rdna-35-igpu-xdna-2-npu)
20 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | docs/main.py


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: CPU Microarchitecture Diagrams
 2 | site_url: https://jia.je/cpu
 3 | repo_url: https://github.com/jiegec/cpu
 4 | edit_uri: edit/master/docs/
 5 | nav:
 6 |   - CPU Microarchitecture Diagrams: index.md
 7 |   - Comparisons across microarchitectures: comparison.md
 8 |   - Reverse engineered conditional branch predictors: cbp.md
 9 |   - AArch64:
10 |       - ARM Cortex-A75: cortex_a75.md
11 |       - ARM Cortex-A77: cortex_a77.md
12 |       - ARM Cortex-X1: cortex_x1.md
13 |       - ARM Cortex-X2: cortex_x2.md
14 |       - ARM Cortex-X3: cortex_x3.md
15 |       - ARM Cortex-X4: cortex_x4.md
16 |       - ARM Cortex-X925: cortex_x925.md
17 |       - ARM Neoverse-V2: neoverse_v2.md
18 |       - Ampere One: ampere_one.md
19 |       - Apple M1 P-core (Firestorm): firestorm.md
20 |       - Apple M2 P-core (Avalanche): avalanche.md
21 |       - Apple M3 P-core: m3_pcore.md
22 |       - Apple M4 P-core: m4_pcore.md
23 |       - Phytium Xiaomi: xiaomi.md
24 |       - Qualcomm Oryon: oryon.md
25 |   - AMD64:
26 |       - AMD Zen1: zen1.md
27 |       - AMD Zen2: zen2.md
28 |       - AMD Zen3: zen3.md
29 |       - AMD Zen4: zen4.md
30 |       - AMD Zen5: zen5.md
31 |       - Intel Skylake: skylake.md
32 |       - Intel Ice Lake (Sunny Cove): sunny_cove.md
33 |       - Intel Alder Lake P-core (Golden Cove): golden_cove.md
34 |       - Intel Alder Lake E-core (Gracemont): gracemont.md
35 |       - Intel Meteor Lake P-core (Redwood Cove): redwood_cove.md
36 |       - Intel Meteor Lake E-core (Crestmont): crestmont.md
37 |       - Intel Lunar Lake P-core (Lion Cove): lion_cove.md
38 |       - Intel Lunar Lake E-core (Skymont): skymont.md
39 |   - LoongArch64:
40 |       - Loongson 3A6000: 3a6000.md
41 |   - RISC-V64:
42 |       - SiFive P550: p550.md
43 |       - SiFive P870: p870.md
44 |   - Dieshot: dieshot.md
45 | theme:
46 |   name: material
47 |   icon:
48 |     repo: fontawesome/brands/github
49 |   features:
50 |     - content.action.edit
51 |     - navigation.expand
52 |     - navigation.tracking
53 |     - navigation.sections
54 |     - search.suggest
55 |     - search.highlight
56 |     - search.share
57 | plugins:
58 |   - git-revision-date-localized:
59 |       enable_creation_date: true
60 |   - glightbox:
61 |       height: 100vh
62 |   - search
63 |   - offline
64 |   - social:
65 |       cards_layout_options:
66 |         font_family: Noto Sans SC
67 |   - macros
68 | markdown_extensions:
69 |   - pymdownx.superfences:
70 |       custom_fences:
71 |         - name: mermaid
72 |           class: mermaid
73 |           format: !!python/name:pymdownx.superfences.fence_code_format
74 | extra:
75 |   analytics:
76 |     provider: google
77 |     property: G-3109FRSVTT
78 | copyright: Copyright &copy; 2024 Jiajie Chen
79 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "cpu"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["Jiajie Chen <c@jia.je>"]
 6 | readme = "README.md"
 7 | package-mode = false
 8 | 
 9 | [tool.poetry.dependencies]
10 | python = "^3.9"
11 | mkdocs = "^1.6.0"
12 | mkdocs-material = "^9.5.30"
13 | mkdocs-glightbox = "^0.4.0"
14 | mkdocs-git-revision-date-localized-plugin = "^1.2.6"
15 | cairosvg = "^2.7.1"
16 | mkdocs-macros-plugin = "^1.0.5"
17 | pandas = "^2.2.2"
18 | tabulate = "^0.9.0"
19 | 
20 | 
21 | [build-system]
22 | requires = ["poetry-core"]
23 | build-backend = "poetry.core.masonry.api"
24 | 


--------------------------------------------------------------------------------