Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SUQADD (vector, 4H)

Test 1: uops

Code:

  suqadd v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d tlb miss (a1) | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10043037236125482510001000100039831303018303730372415328951000100020003037303711100110002073116112630100030383038303830383038
10043037226125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037226125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037236125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037226125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037236125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
100430372212425482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037226125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037236125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037236125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1 (dependency chain through v0, which SUQADD both reads and writes)

Code:

  suqadd v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250612954825101001001000010010000500427731303001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731303001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731313001803003730037282653287451010020010000200200003003730226111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372240612954825101001001000010010000500427731313001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372259612954825101001001000010010000500427731313001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731313001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731313001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731303001803003730037282653287451010020010330200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731303001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
102043003722515612954825101001001000010010000500427731303001803003730037282653287451010020010000200200003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100006405163429630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100006422162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100006402162229662010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100006422163229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100006422162229630010000103003830038300383003830038
100243003722506129548251001010100001210000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100006404162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100006422162229630010000103003830038300383003830038
100243003722406129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100006404162229630010000103003830038300383003830038

Test 3: Latency 1->2 (v0 is used as both the destination and the source operand)

Code:

  suqadd v0.4h, v0.4h
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250612954725101001001000010010000500427716003001830037300372827162873310100200100082002001630037300371110201100991001001000010010111718160296460100001003003830038300383003830038
10204300372250612954725101001001000010010000500427716003001830037300372827172874010100200100082002001630037300371110201100991001001000010000111718160296450100001003003830038300383003830038
10204300372250612954725101001001000010010000500427716013001830037300372827172874110100200100082002001630037300371110201100991001001000010000111717160296460100001003003830038300383003830038
10204300372250612954725101001001000010010000500427716003001830037300372827172874010100200100082002001630037300371110201100991001001000010000111717160296460100001003003830038300383003830038
10204300372250822954725101001001000010010000500427716003001830037300372827162874010100206100082002001630037300371110201100991001001000010000111717160296461100001003003830038300383003830038
10204300372250612954725101001001000010010000500427716003001830037300372827162874010100200100082002001630037300371110201100991001001000010000111717160296450100001003003830038300383003830038
10204300372250612954725101001001000010010000500427716003001830037300372827162874110100200100082002001630037300371110201100991001001000010000111718160296460100001003003830038300383003830038
102043003722506312954725101001001000010010000500427716003001830037300372827172874010100200100082002001630037300371110201100991001001000010020111718160296460100001003003830038300383003830038
102043003722507262954725101001001000010010000500427716003001830037300372827162874010100200100082002001630037300371110201100991001001000010000111718160296460100001003003830038300383003830038
10204300372250612954725101001001000010010000500427716003001830037300372827162874110100200100082002001630037300371110201100991001001000010000111717160296460100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225006129547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038
1002430037225006129547251001010100001010000504277160130018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038
1002430037225006129547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103007130038300383003830038
1002430037225006129547251001010100001010000504277160130018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038
1002430037225006129547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038
1002430037225006129547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038
1002430037224006129547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038
10024300372250021229547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038
1002430037225006129547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316432962910000103003830038300383003830038
1002430037225006129547251001010100001010000504277160030018300373003728286032876710010201000020200003003730037111002110910101000010000640316332962910000103003830038300383003830038

Test 4: throughput (each destination register is reset with MOVI before its SUQADD)

Count: 8

Code:

  movi v0.16b, 0
  suqadd v0.4h, v8.4h
  movi v1.16b, 0
  suqadd v1.4h, v8.4h
  movi v2.16b, 0
  suqadd v2.4h, v8.4h
  movi v3.16b, 0
  suqadd v3.4h, v8.4h
  movi v4.16b, 0
  suqadd v4.4h, v8.4h
  movi v5.16b, 0
  suqadd v5.4h, v8.4h
  movi v6.16b, 0
  suqadd v6.4h, v8.4h
  movi v7.16b, 0
  suqadd v7.4h, v8.4h
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420091150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111012471688200621600001002006620066200662006620066
16020420065150009292580116100800161008002850064019602004520065200656128044720080028200160056200652006511160201100991001001600001000001111012771683200621600001002006620066200662006620066
16020420065150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111012781687200621600001002006620066200662006620066
160204200651500021292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111012781688200621600001002006620066200662006620066
16020420065151000502580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111012781638200621600001002006620066200662006620066
1602042006515000402292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111012681688200621600001002006620066200662006620066
160204200651500036011212580116100800161008002850064019612004520065200656128044320080028200160056200652006511160201100991001001600001000001111012771687200621600001002006620066200662006620066
16020420065150004712002580116100800161008002850064019602004520065200656128012820080132200160056200652006511160201100991001001600001000001111012681683200621600001002006620066200662006620066
160204200651500024292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111012281688200621600001002006620066200662006620066
1602042006515000396292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111012881688200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | 18 | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200851501000005725800101080000108000050640000111020029200482004832280010208010520160000200482004811160021109101016000010000100491331432211117262004516160000102004920049200492004920049
160024200481500000015725800101080000108000050640000111020029200482004832280010208000020160000200482004811160021109101016000010000100501351252211127262004516160000102004920049200492004920049
160024200481501010015725800101080000108000050640000111020029200482004832280010208000020160000200482004811160021109101016000010000100521351262211127272004516160000102004920049200492004920049
160024200481501010005725800101080000108000050640000111020029200482004832280010208000020160000200482004811160021109101016000010000100491361262211125252004516160000102004920049200492004920049
160024200481500010014525800101080000108000050640000111020029200482004832280010208000020160000200482004811160021109101016000010000100481351272211126152004516160000102004920049200492004920049
160024200481501000915725800101080000108000050640000111020029200482004832280010208000020160836200482004811160021109101016000010000100491351262211116252004516160000102004920049200492004920049
160024200481501000016425800101080000108000050640000111020029200482004838580010208000020160000200482004811160021109101016000010100100501351252211121252004516160000102004920049200492004920049
160024200481501110005725800101080105108000050640000111020029200482004832280010208000020160000200482004811160021109101016000010043100481351252211115272004516160000102004920049200492004920049
160024200481501010005725800101080000108000050640000111020029200482004832280010208000020160000200482004811160021109101016000010000100491351252211226262004516160000102004920049200492004920049
160024200481501010005725800101080000108000050640000111020029200482004832280010208000020160000200482004811160021109101016000010000100471351252211126212004516160000102011920049200492004920049

Test 5: throughput (16 destination registers, no MOVI between SUQADDs)

Count: 16

Code:

  suqadd v0.4h, v16.4h
  suqadd v1.4h, v16.4h
  suqadd v2.4h, v16.4h
  suqadd v3.4h, v16.4h
  suqadd v4.4h, v16.4h
  suqadd v5.4h, v16.4h
  suqadd v6.4h, v16.4h
  suqadd v7.4h, v16.4h
  suqadd v8.4h, v16.4h
  suqadd v9.4h, v16.4h
  suqadd v10.4h, v16.4h
  suqadd v11.4h, v16.4h
  suqadd v12.4h, v16.4h
  suqadd v13.4h, v16.4h
  suqadd v14.4h, v16.4h
  suqadd v15.4h, v16.4h
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020440061299363025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011801600400361600001004004040040400404004040040
160204400392993650525160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011801600400361600001004004040040400404004040040
160204400393002463025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011801600400361600001004004040040400404004040040
160204400393000261251601081001600081001600205001280132140020400394003919977261999016012020016003220032006440039400391116020110099100100160000100001111011801600400361600001004004040040400404004040040
16020440039299273025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011801600400361600001004004040040400404004040040
160204400393003363025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011801600400361600001004004040040400404004040040
1602044003930027930251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001006202211012812311400451600001004004940049400504004940049
160204400483002376426160116100160016100160108500128019614002940049400491997691998616012820016003820032007640048400481116020110099100100160000100002221012812311400461600001004004940049400494004940050
1602044004830006426160116100160016100160028500128019614002940049400482001491998616012820016003820032007640048400481116020110099100100160000100002221012912311400461600001004004940049400494004940049
16020440048300064261601161001600161001600285001280196140029400494004819976101998616012820016003820032007640048400481116020110099100100160000100002221012812311400451600001004004940049400494004940049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | c2 | branch cond mispred nonspec (c5) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600244004030011752251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010000100228215164118840036155160000104004040040400404004040040
16002440039300069251600101016000010160000501280000110400204003940039199963200191600102016000020320000400394003921160021109101016000010000100228418162216640036155160000104004040040400404004040040
1600244003930013546251600101016000010160000501280000115400204003940039199963200191603252016000020320000400394003911160021109101016000010000100228418162116640036155160000104004040040400404004040040
16002440039300334625160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001000010024114281621188400361510160000104004040040400404004040040
160024400393001246251600101016000010160000501280000015400204003940039199963200191600102016000020320000400394003911160021109101016000010000100228415162118940036155160000104004040040400404004040040
16002440039300052251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010000100228429162116640036155160000104004040040400404004040040
1600244003930010546251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010000100228416162116940036155160000104004040040400404004040040
16002440039300844625160010101600001016000050128000011540020400394003919996202001916001020160000203200004003940039111600211091010160000100001002284181621188400363010160000104004040040400404004040040
16002440039300126462516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000100001002284161621167400363010160000104004040040400404004040040
1600244003930014146251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010000100228419164118840036155160000104004040040400404004040040