Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SETF16

Test 1: uops

Code:

  setf16 w1
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9)
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001

Test 2: Latency 2->1

Chain cycles: 1

Code:

  setf16 w1
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20204 | 20030 | 20101 | 20101 | 20107 | 519076 | 20108 | 20214 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20024 | 20030 | 20011 | 20011 | 20018 | 519507 | 20017 | 20032 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010

Test 3: Latency 2->2

Code:

  setf16 w0
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10204 | 10030 | 10201 | 10201 | 10212 | 254506 | 10211 | 10214 | 20228 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10024 | 10030 | 10021 | 10021 | 10029 | 255236 | 10029 | 10032 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10029 | 255236 | 10029 | 10032 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255191 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10

Test 4: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  setf16 w0
  ands xzr, xzr, xzr
  setf16 w0
  ands xzr, xzr, xzr
  setf16 w0
  ands xzr, xzr, xzr
  setf16 w0
  ands xzr, xzr, xzr
  setf16 w0
  ands xzr, xzr, xzr
  setf16 w0
  ands xzr, xzr, xzr
  setf16 w0
  ands xzr, xzr, xzr
  setf16 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7889

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160204 | 63270 | 160112 | 160112 | 160118 | 690899 | 160118 | 160220 | 160220 | 160013 | 100
160204 | 63117 | 160118 | 160118 | 160124 | 688223 | 160119 | 160220 | 160220 | 160010 | 100
160204 | 63100 | 160110 | 160110 | 160115 | 689181 | 160120 | 160220 | 160216 | 160011 | 100
160204 | 63127 | 160112 | 160112 | 160118 | 688236 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63082 | 160112 | 160112 | 160118 | 687157 | 160117 | 160218 | 160224 | 160014 | 100
160204 | 63102 | 160113 | 160113 | 160118 | 690367 | 160117 | 160218 | 160220 | 160013 | 100
160204 | 63127 | 160112 | 160112 | 160118 | 687037 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63128 | 160111 | 160111 | 160116 | 690896 | 160118 | 160220 | 160220 | 160014 | 100
160204 | 63072 | 160114 | 160114 | 160120 | 687961 | 160118 | 160220 | 160224 | 160019 | 100
160205 | 63148 | 160154 | 160154 | 160160 | 688682 | 160117 | 160218 | 160220 | 160013 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7880

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160024 | 64410 | 160021 | 160021 | 160026 | 701941 | 160030 | 160040 | 160020 | 160001 | 10
160024 | 63208 | 160011 | 160011 | 160010 | 701734 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63031 | 160011 | 160011 | 160010 | 696929 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63036 | 160011 | 160011 | 160010 | 699834 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63069 | 160011 | 160011 | 160010 | 703146 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63031 | 160011 | 160011 | 160010 | 697562 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63012 | 160011 | 160011 | 160010 | 695565 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63026 | 160011 | 160011 | 160010 | 701470 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63058 | 160011 | 160011 | 160010 | 699657 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63048 | 160011 | 160011 | 160010 | 697092 | 160010 | 160020 | 160020 | 160001 | 10

Test 5: throughput

Count: 4

Code:

  fcmp s0, s0
  setf16 w0
  setf16 w0
  setf16 w0
  setf16 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5997

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50204 | 24059 | 50107 | 40104 | 10003 | 40114 | 10004 | 315398 | 40012 | 50112 | 40209 | 10003 | 80234 | 20010 | 40006 | 100
50204 | 23973 | 50107 | 40104 | 10003 | 40114 | 10004 | 315520 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40002 | 100
50204 | 23979 | 50106 | 40103 | 10003 | 40109 | 10003 | 315132 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40002 | 100
50204 | 23999 | 50105 | 40102 | 10003 | 40112 | 10004 | 314947 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40003 | 100
50204 | 23967 | 50103 | 40101 | 10002 | 40109 | 10003 | 315987 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40001 | 100
50204 | 23977 | 50103 | 40101 | 10002 | 40112 | 10004 | 315987 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40003 | 100
50204 | 23987 | 50109 | 40105 | 10004 | 40116 | 10004 | 315132 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23976 | 50105 | 40102 | 10003 | 40112 | 10004 | 315132 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23967 | 50104 | 40101 | 10003 | 40112 | 10004 | 315987 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40003 | 100
50204 | 23981 | 50106 | 40103 | 10003 | 40109 | 10003 | 315987 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50024 | 24065 | 50018 | 40015 | 10003 | 40024 | 10004 | 315923 | 40013 | 50022 | 40029 | 10003 | 80020 | 20000 | 40001 | 10
50024 | 24028 | 50011 | 40011 | 10000 | 40010 | 10000 | 315849 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23956 | 50011 | 40011 | 10000 | 40010 | 10000 | 316143 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23993 | 50011 | 40011 | 10000 | 40010 | 10000 | 315746 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23993 | 50011 | 40011 | 10000 | 40010 | 10000 | 316115 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23954 | 50011 | 40011 | 10000 | 40010 | 10000 | 316575 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24009 | 50011 | 40011 | 10000 | 40010 | 10000 | 317239 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23991 | 50011 | 40011 | 10000 | 40010 | 10000 | 316269 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23954 | 50011 | 40011 | 10000 | 40010 | 10000 | 316317 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23956 | 50011 | 40011 | 10000 | 40010 | 10000 | 316088 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10

Test 6: throughput

Count: 7

Code:

  ands xzr, xzr, xzr
  setf16 w0
  setf16 w0
  setf16 w0
  setf16 w0
  setf16 w0
  setf16 w0
  setf16 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5568

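retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80204 | 39060 | 80107 | 80107 | 80116 | 0 | 548603 | 0 | 0 | 80119 | 80220 | 0 | 0 | 140220 | 80003 | 100
80204 | 39015 | 80112 | 80112 | 80120 | 0 | 550543 | 0 | 0 | 80108 | 80208 | 0 | 0 | 140220 | 80005 | 100
80204 | 38974 | 80104 | 80104 | 80114 | 0 | 550546 | 0 | 0 | 80111 | 80212 | 0 | 0 | 140228 | 80004 | 100
80204 | 38969 | 80105 | 80105 | 80114 | 0 | 549185 | 0 | 0 | 80108 | 80208 | 0 | 0 | 140214 | 80003 | 100
80204 | 39042 | 80105 | 80105 | 80111 | 0 | 548752 | 0 | 0 | 80119 | 80220 | 0 | 0 | 140228 | 80009 | 100
80204 | 38973 | 80103 | 80103 | 80111 | 0 | 550985 | 0 | 0 | 80108 | 80208 | 0 | 0 | 140228 | 80006 | 100
80204 | 38966 | 80104 | 80104 | 80108 | 0 | 549310 | 0 | 0 | 80150 | 80252 | 0 | 0 | 140214 | 80003 | 100
80204 | 38997 | 80108 | 80108 | 80116 | 0 | 549899 | 0 | 0 | 80111 | 80212 | 0 | 0 | 140234 | 80007 | 100
80204 | 38970 | 80103 | 80103 | 80108 | 0 | 550540 | 0 | 0 | 80117 | 80218 | 0 | 0 | 140228 | 80005 | 100
80204 | 38984 | 80105 | 80105 | 80114 | 0 | 548462 | 0 | 0 | 80114 | 80216 | 0 | 0 | 140220 | 80003 | 100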

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5562

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80024 | 38957 | 80032 | 80032 | 0 | 80044 | 549380 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38912 | 80021 | 80021 | 0 | 80020 | 550668 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38981 | 80021 | 80021 | 0 | 80020 | 548107 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38945 | 80021 | 80021 | 0 | 80020 | 550649 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38924 | 80021 | 80021 | 0 | 80020 | 550737 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38904 | 80021 | 80021 | 0 | 80020 | 550668 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38962 | 80021 | 80021 | 0 | 80020 | 550337 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38972 | 80021 | 80021 | 0 | 80020 | 549159 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38968 | 80021 | 80021 | 0 | 80020 | 550332 | 80020 | 80020 | 140122 | 80050 | 10
80024 | 38887 | 80021 | 80021 | 0 | 80020 | 550251 | 80020 | 80020 | 140020 | 80011 | 10