Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SETF8

Test 1: uops

Code:

  setf8 w1
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9)
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001

Test 2: Latency 2->1

Chain cycles: 1

Code:

  setf8 w1
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20204 | 20030 | 20101 | 20101 | 20108 | 519339 | 20108 | 20216 | 30221 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519434 | 20107 | 20214 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20205 | 20060 | 20115 | 20115 | 20147 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20024 | 20030 | 20011 | 20011 | 20018 | 519638 | 20018 | 20036 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20018 | 519495 | 20018 | 20036 | 30038 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010

Test 3: Latency 2->2

Code:

  setf8 w0
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10204 | 10030 | 10201 | 10201 | 10209 | 254709 | 10208 | 10208 | 20228 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10205 | 10060 | 10215 | 10215 | 10250 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10024 | 10030 | 10021 | 10021 | 10029 | 255236 | 10029 | 10032 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20174 | 10053 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255593 | 10059 | 10059 | 20098 | 10032 | 10

Test 4: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7890

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160204 | 63345 | 160115 | 160115 | 160120 | 681658 | 160309 | 160409 | 160222 | 160013 | 100
160204 | 63138 | 160113 | 160113 | 160119 | 687412 | 160121 | 160222 | 160226 | 160019 | 100
160204 | 63102 | 160113 | 160113 | 160118 | 686775 | 160119 | 160220 | 160216 | 160011 | 100
160204 | 63065 | 160115 | 160115 | 160120 | 688894 | 160118 | 160220 | 160216 | 160008 | 100
160204 | 63147 | 160112 | 160112 | 160118 | 691935 | 160118 | 160220 | 160220 | 160011 | 100
160204 | 63170 | 160112 | 160112 | 160118 | 690544 | 160123 | 160224 | 160224 | 160014 | 100
160204 | 63099 | 160111 | 160111 | 160116 | 689652 | 160118 | 160220 | 160216 | 160011 | 100
160204 | 63102 | 160120 | 160120 | 160126 | 688283 | 160118 | 160220 | 160220 | 160013 | 100
160204 | 63108 | 160112 | 160112 | 160118 | 688787 | 160118 | 160220 | 160220 | 160015 | 100
160204 | 63091 | 160114 | 160114 | 160120 | 687081 | 160120 | 160220 | 160220 | 160012 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7882

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160024 | 64470 | 160026 | 160026 | 160033 | 698935 | 160033 | 160044 | 160020 | 160001 | 10
160024 | 63239 | 160011 | 160011 | 160010 | 695853 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63096 | 160011 | 160011 | 160010 | 698154 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63032 | 160011 | 160011 | 160010 | 700418 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63179 | 160011 | 160011 | 160010 | 703270 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63066 | 160011 | 160011 | 160010 | 699082 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63044 | 160011 | 160011 | 160010 | 705830 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63042 | 160011 | 160011 | 160010 | 701109 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63060 | 160011 | 160011 | 160010 | 697346 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63072 | 160011 | 160011 | 160010 | 698468 | 160010 | 160020 | 160020 | 160001 | 10

Test 5: throughput

Count: 4

Code:

  fcmp s0, s0
  setf8 w0
  setf8 w0
  setf8 w0
  setf8 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50204 | 24034 | 50104 | 40101 | 10003 | 40112 | 10004 | 315205 | 40013 | 50112 | 40209 | 10003 | 80224 | 20008 | 40001 | 100
50204 | 24004 | 50106 | 40103 | 10003 | 40112 | 10004 | 315316 | 40013 | 50112 | 40209 | 10003 | 80224 | 20008 | 40002 | 100
50204 | 24007 | 50104 | 40101 | 10003 | 40109 | 10003 | 314864 | 40012 | 50112 | 40209 | 10003 | 80224 | 20008 | 40003 | 100
50204 | 23988 | 50105 | 40102 | 10003 | 40112 | 10004 | 314891 | 40012 | 50112 | 40209 | 10003 | 80218 | 20006 | 40001 | 100
50204 | 23973 | 50109 | 40105 | 10004 | 40116 | 10004 | 315197 | 40012 | 50112 | 40209 | 10003 | 80218 | 20006 | 40002 | 100
50204 | 23968 | 50104 | 40101 | 10003 | 40112 | 10004 | 315200 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23983 | 50103 | 40101 | 10002 | 40109 | 10003 | 315001 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40001 | 100
50204 | 24004 | 50106 | 40103 | 10003 | 40112 | 10004 | 315003 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23991 | 50104 | 40101 | 10003 | 40112 | 10004 | 314891 | 40012 | 50112 | 40209 | 10003 | 80224 | 20008 | 40002 | 100
50204 | 23987 | 50104 | 40101 | 10003 | 40112 | 10004 | 315003 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40001 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50024 | 24111 | 50014 | 40012 | 10002 | 40022 | 10004 | 316309 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23962 | 50011 | 40011 | 10000 | 40010 | 10000 | 316249 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23979 | 50011 | 40011 | 10000 | 40010 | 10000 | 316639 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23993 | 50011 | 40011 | 10000 | 40010 | 10000 | 315966 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24053 | 50011 | 40011 | 10000 | 40010 | 10000 | 316363 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24027 | 50011 | 40011 | 10000 | 40010 | 10000 | 315455 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23975 | 50011 | 40011 | 10000 | 40010 | 10000 | 315510 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23993 | 50011 | 40011 | 10000 | 40010 | 10000 | 316838 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24027 | 50011 | 40011 | 10000 | 40010 | 10000 | 316811 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23973 | 50011 | 40011 | 10000 | 40010 | 10000 | 316209 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10

Test 6: throughput

Count: 7

Code:

  ands xzr, xzr, xzr
  setf8 w0
  setf8 w0
  setf8 w0
  setf8 w0
  setf8 w0
  setf8 w0
  setf8 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5567

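retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80204 | 39036 | 80107 | 80107 | 80116 | 0 | 549553 | 0 | 0 | 80111 | 80212 | 0 | 0 | 140224 | 80013 | 100
80204 | 38961 | 80107 | 80107 | 80114 | 0 | 550242 | 0 | 0 | 80116 | 80216 | 0 | 0 | 140214 | 80003 | 100
80204 | 39002 | 80104 | 80104 | 80111 | 0 | 549803 | 0 | 0 | 80111 | 80212 | 0 | 0 | 140220 | 80003 | 100
80204 | 38984 | 80103 | 80103 | 80108 | 0 | 548620 | 0 | 0 | 80153 | 80253 | 0 | 0 | 140284 | 80033 | 100
80204 | 38966 | 80109 | 80109 | 80114 | 0 | 548982 | 0 | 0 | 80120 | 80222 | 0 | 0 | 140214 | 80004 | 100
80205 | 39044 | 80134 | 80134 | 80154 | 0 | 551117 | 0 | 0 | 80114 | 80216 | 0 | 0 | 140220 | 80003 | 100
80204 | 38965 | 80104 | 80104 | 80114 | 0 | 549756 | 0 | 0 | 80116 | 80216 | 0 | 0 | 140228 | 80007 | 100
80204 | 38967 | 80104 | 80104 | 80111 | 0 | 548301 | 0 | 0 | 80111 | 80212 | 0 | 0 | 140220 | 80006 | 100
80204 | 38944 | 80104 | 80104 | 80114 | 0 | 550424 | 0 | 0 | 80116 | 80216 | 0 | 0 | 140214 | 80003 | 100
80204 | 39022 | 80107 | 80107 | 80116 | 0 | 550276 | 0 | 0 | 80114 | 80216 | 0 | 0 | 140228 | 80005 | 100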

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5562

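retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
80024 | 39010 | 80030 | 80030 | 0 | 0 | 80041 | 0 | 548186 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38958 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 550251 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38977 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 547971 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38955 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 549142 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80025 | 38955 | 80061 | 80061 | 0 | 0 | 80077 | 0 | 551652 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38950 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 550318 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38946 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 547256 | 80079 | 80080 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38899 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 548495 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38881 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 550730 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10
80024 | 38905 | 80021 | 80021 | 0 | 0 | 80020 | 0 | 550162 | 80020 | 80020 | 140020 | 0 | 0 | 80011 | 0 | 0 | 10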