Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

USQADD (scalar, D)

Test 1: uops

Code:

  usqadd d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000
1004303310011100010007590510001000200011000

Test 2: Latency 1->1

Code:

  usqadd d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300768905101002001000620020092110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024300331002121100002010000707689051002020100062020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010
10024300331002121100002010000707689051002020100002020000111000010

Test 3: Latency 1->2

Code:

  usqadd d0, d0
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020096110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100
1020430033101011011000010010000300768905101002001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1002430033100212110000201000070768905100202010004200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010
1002430033100212110000201000070768905100202010000200200001101000010

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  usqadd d0, d8
  movi v1.16b, 0
  usqadd d1, d8
  movi v2.16b, 0
  usqadd d2, d8
  movi v3.16b, 0
  usqadd d3, d8
  movi v4.16b, 0
  usqadd d4, d8
  movi v5.16b, 0
  usqadd d5, d8
  movi v6.16b, 0
  usqadd d6, d8
  movi v7.16b, 0
  usqadd d7, d8
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5011

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602044047980109101800081008001230032005280112200800122001600241160000100
1602044010880110101800091008001330032005280112200800122001600261160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600261160000100
1602044009680110101800091008001330032005280112200800122001600261160000100
1602044009880110101800091008001330032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5052

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160024438338001911800081080012030032005280022200800122001600001016000010
160024412258001111800001080000030032000080010200800002001600001016000010
160024404268001111800001080000030032000080010200800002001600001016000010
160024404308001111800001080000030032000080010200800002001600001016000010
160024403998001111800001080000030032000080010200800002001600001016000010
160024404568001111800001080000030032000080010200800002001600001016000010
160024403898001111800001080000030032000080010200800002001600001016000010
160024404138001111800001080000030032000080010200800002001600001016000010
160024404068001111800001080000030032000080010200800002001600001016000010
160024404228001111800001080000030032000080010200800002001600001016000010

Test 5: throughput

Count: 16

Code:

  usqadd d0, d16
  usqadd d1, d16
  usqadd d2, d16
  usqadd d3, d16
  usqadd d4, d16
  usqadd d5, d16
  usqadd d6, d16
  usqadd d7, d16
  usqadd d8, d16
  usqadd d9, d16
  usqadd d10, d16
  usqadd d11, d16
  usqadd d12, d16
  usqadd d13, d16
  usqadd d14, d16
  usqadd d15, d16
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020480107160105101160004100160008300640424160205200160109202032022420160000100
16020480224160205101160104100160108300640044160110200160014200032031810160000100
16020480256160255103160152102160156300640044160110200160014200032002410160000100
16020480035160105101160004100160008300640036160108200160012200032002410160000100
16020480035160105101160004100160008300640036160108200160012200032002410160000100
16020480035160105101160004100160008300640036160108200160012200032002410160000100
16020480035160105101160004100160008300640036160108200160012200032002410160000100
16020480035160105101160004100160008300640216160156200160063200032002410160000100
16020480035160105101160004100160008300640036160108200160012200032002410160000100
16020480035160105101160004100160008300640036160108200160012200032002410160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1600248018716001511160004010160008306400361600182016001220320000116000010
1600248004516001111160000010160000306400001600102016000020320000116000010
1600248003516001111160000010160000306400001600102016000020320000116000010
1600248003516001111160000010160000306402161600662016006520320000116000010
1600248003516001111160000010160000306400001600102016000020320000116000010
1600248003516001111160000010160000306400001600102016000020320000116000010
1600248003516001111160000010160000306400001600102016000020320000116000010
1600248003516001111160000010160000306400001600102016000020320000116000010
1600248003516001111160000010160000306400001600102016000020320000116000010
1600248003516001111160000010160000306400001600102016000020320000116000010