Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

RMIF

Test 1: uops

Code:

  rmif x1, #1, #1
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9)
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001

Test 2: Latency 2->1

Chain cycles: 1

Code:

  rmif x1, #1, #1
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20204 | 20030 | 20101 | 20101 | 20107 | 519329 | 20107 | 20214 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20024 | 20030 | 20011 | 20011 | 20018 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010

Test 3: Latency 2->2

Code:

  rmif x0, #1, #1
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10205 | 10060 | 10215 | 10215 | 10246 | 254770 | 10208 | 10208 | 20228 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20228 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10024 | 10098 | 10021 | 10021 | 10028 | 255150 | 10028 | 10032 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10

Test 4: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  rmif x0, #1, #1
  ands xzr, xzr, xzr
  rmif x0, #1, #1
  ands xzr, xzr, xzr
  rmif x0, #1, #1
  ands xzr, xzr, xzr
  rmif x0, #1, #1
  ands xzr, xzr, xzr
  rmif x0, #1, #1
  ands xzr, xzr, xzr
  rmif x0, #1, #1
  ands xzr, xzr, xzr
  rmif x0, #1, #1
  ands xzr, xzr, xzr
  rmif x0, #1, #1
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7889

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160204 | 63287 | 160112 | 160112 | 160118 | 689871 | 160124 | 160226 | 160218 | 160014 | 100
160204 | 63093 | 160114 | 160114 | 160118 | 686358 | 160118 | 160220 | 160220 | 160015 | 100
160204 | 63112 | 160112 | 160112 | 160118 | 690096 | 160121 | 160222 | 160220 | 160010 | 100
160204 | 63082 | 160112 | 160112 | 160118 | 688787 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63119 | 160113 | 160113 | 160117 | 689112 | 160123 | 160224 | 160220 | 160012 | 100
160204 | 63120 | 160113 | 160113 | 160117 | 692303 | 160120 | 160220 | 160220 | 160011 | 100
160204 | 63115 | 160112 | 160112 | 160118 | 688536 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63107 | 160112 | 160112 | 160118 | 688893 | 160120 | 160220 | 160220 | 160015 | 100
160204 | 63088 | 160112 | 160112 | 160118 | 690662 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63087 | 160111 | 160111 | 160115 | 691187 | 160118 | 160220 | 160220 | 160012 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7880

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160024 | 64470 | 160026 | 160026 | 160033 | 696238 | 160030 | 160040 | 160020 | 160001 | 10
160024 | 63254 | 160011 | 160011 | 160010 | 702015 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63104 | 160011 | 160011 | 160010 | 694347 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63078 | 160011 | 160011 | 160010 | 702621 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63022 | 160011 | 160011 | 160010 | 700861 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 62993 | 160011 | 160011 | 160010 | 702571 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63059 | 160011 | 160011 | 160010 | 704553 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63030 | 160011 | 160011 | 160010 | 698890 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63009 | 160011 | 160011 | 160010 | 700698 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63030 | 160011 | 160011 | 160010 | 704553 | 160010 | 160020 | 160020 | 160001 | 10

Test 5: throughput

Count: 4

Code:

  fcmp s0, s0
  rmif x0, #1, #1
  rmif x0, #1, #1
  rmif x0, #1, #1
  rmif x0, #1, #1
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50204 | 23999 | 50106 | 40104 | 10002 | 40111 | 10003 | 314903 | 40017 | 50116 | 40212 | 10004 | 80234 | 20010 | 40005 | 100
50204 | 23984 | 50106 | 40103 | 10003 | 40109 | 10003 | 315029 | 40018 | 50116 | 40212 | 10004 | 80282 | 20020 | 40033 | 100
50204 | 23969 | 50105 | 40102 | 10003 | 40112 | 10004 | 315987 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23983 | 50104 | 40101 | 10003 | 40109 | 10003 | 315899 | 40077 | 50191 | 40272 | 10019 | 80218 | 20006 | 40001 | 100
50204 | 23966 | 50104 | 40101 | 10003 | 40109 | 10003 | 315563 | 40012 | 50112 | 40209 | 10003 | 80232 | 20008 | 40005 | 100
50204 | 23967 | 50104 | 40101 | 10003 | 40112 | 10004 | 315617 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40003 | 100
50204 | 23966 | 50104 | 40101 | 10003 | 40109 | 10003 | 315563 | 40012 | 50112 | 40209 | 10003 | 80224 | 20008 | 40002 | 100
50204 | 23999 | 50105 | 40102 | 10003 | 40112 | 10004 | 315702 | 40012 | 50112 | 40209 | 10003 | 80218 | 20006 | 40001 | 100
50204 | 23979 | 50106 | 40103 | 10003 | 40109 | 10003 | 315520 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40002 | 100
50204 | 24004 | 50106 | 40103 | 10003 | 40114 | 10004 | 315296 | 40012 | 50112 | 40209 | 10003 | 80224 | 20008 | 40001 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50024 | 24151 | 50014 | 40012 | 10002 | 40022 | 10004 | 315977 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23964 | 50011 | 40011 | 10000 | 40010 | 10000 | 316071 | 40045 | 50065 | 40064 | 10011 | 80020 | 20000 | 40001 | 10
50024 | 23977 | 50011 | 40011 | 10000 | 40010 | 10000 | 315614 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23999 | 50011 | 40011 | 10000 | 40010 | 10000 | 315524 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24004 | 50011 | 40011 | 10000 | 40010 | 10000 | 315828 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23993 | 50011 | 40011 | 10000 | 40010 | 10000 | 315603 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23986 | 50011 | 40011 | 10000 | 40010 | 10000 | 315747 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23993 | 50011 | 40011 | 10000 | 40010 | 10000 | 316301 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24296 | 50169 | 40138 | 10031 | 40137 | 10031 | 315602 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24023 | 50011 | 40011 | 10000 | 40010 | 10000 | 315396 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10

Test 6: throughput

Count: 7

Code:

  ands xzr, xzr, xzr
  rmif x0, #1, #1
  rmif x0, #1, #1
  rmif x0, #1, #1
  rmif x0, #1, #1
  rmif x0, #1, #1
  rmif x0, #1, #1
  rmif x0, #1, #1
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5568

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80204 | 39011 | 80109 | 80109 | 80117 | 551078 | 80111 | 80212 | 140214 | 80005 | 100
80204 | 39035 | 80105 | 80105 | 80108 | 550486 | 80111 | 80212 | 140220 | 80004 | 100
80204 | 39032 | 80107 | 80107 | 80116 | 548963 | 80116 | 80216 | 140238 | 80015 | 100
80204 | 38941 | 80104 | 80104 | 80114 | 547262 | 80116 | 80216 | 140228 | 80004 | 100
80204 | 38989 | 80106 | 80106 | 80111 | 549465 | 80111 | 80212 | 140228 | 80005 | 100
80204 | 38950 | 80106 | 80106 | 80114 | 548458 | 80114 | 80216 | 140214 | 80004 | 100
80204 | 38985 | 80103 | 80103 | 80111 | 551999 | 80116 | 80216 | 140228 | 80006 | 100
80204 | 38914 | 80102 | 80102 | 80111 | 550000 | 80116 | 80216 | 140228 | 80007 | 100
80204 | 38958 | 80104 | 80104 | 80111 | 548033 | 80108 | 80208 | 140228 | 80007 | 100
80204 | 38942 | 80105 | 80105 | 80114 | 547687 | 80111 | 80212 | 140228 | 80004 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5561

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80024 | 39196 | 80031 | 80031 | 80042 | 549783 | 80036 | 80036 | 140020 | 80011 | 10
80024 | 38910 | 80021 | 80021 | 80020 | 551652 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38956 | 80021 | 80021 | 80020 | 551697 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 39041 | 80021 | 80021 | 80020 | 549792 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38921 | 80021 | 80021 | 80020 | 551521 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38956 | 80021 | 80021 | 80020 | 549291 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38904 | 80021 | 80021 | 80020 | 549044 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38881 | 80021 | 80021 | 80020 | 549621 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38956 | 80021 | 80021 | 80020 | 550287 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38941 | 80021 | 80021 | 80020 | 549717 | 80020 | 80020 | 140124 | 80053 | 10