Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

USQADD (scalar, H)

Test 1: uops

Code:

  usqadd h0, h1

(register initialization, not part of the measured code:)

  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004303723006125482510001000100039831311530183037303724153289510001000200030373037111001100007315216222630100030383038303830383038
100430372339061254825100010001000398313103018303730372415328951000100020003037303711100110000730216222630100030383038303830383038
100430372400612548251000100010003983131030183037303724153289510001000200030373037111001100007315216222630100030383038303830383038
100430372312061254825100010001000398313103018303730372415328951000100020003037303711100110000730216222630100030383038303830383038
10043037243010325482510001000100039831301530183037303724153289510001000200030373037111001100007315216222630100030383038303830383038
10043037230061254825100010001000398313103018303730372415328951000100020003037303711100110000730216222630100030383038303830383038
1004303724141061254825100010001000398313103018303730372415328951000100020003037303711100110000730216222630100030383038303830383038
1004303724006125482510001000100039831301530183037303724153289510001000200030373037111001100007315216222630100030383038303830383038
1004303724006125482510001000100039831301530183037303724153289510001000200030373037111001100007315216222630100030383038303830383038
100430372300722548251000100010003983131030183037303724153289510001000200030373037111001100007315216222630100030383038303830383038

Test 2: Latency 1->1

Code:

  usqadd h0, h1

(register initialization, not part of the measured code:)

  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003723200612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003723200662954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003723300612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003723200612954825101001001000010010000500427867013001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003723300612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000371011611296340100001003003830038300383003830038
102043003724100612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
1020430037233015612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
1020430037233004692954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003723200612954825101301001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003724100612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372330612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372330612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372320612954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372330612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372330892954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001010640216222963010000103003830038300383003830038
10024300372330612954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372420612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003723208542954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372320612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372330612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  usqadd h0, h0

(register initialization, not part of the measured code:)

  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372330000000066295472510100100100001001015050042771600300183003730037282717287401010020010008214200163003730037111020110099100100100001000000147011171811631296450100001003003830038300383003830038
10204300372330000000157229547251010010010000100100005004277160030018300373003728271242874010100200100082002001630037300371110201100991001001000010000003011171811611296460100001003003830038300383003830038
1020430037233000000006129547251010010010000100100005004277160030018300373003728271728741101002001000820020016300373003711102011009910010010000100000370011171711611296450100001003003830038300383003830038
102043003723300000000612954725101001001000010010000500427716003001830037300372827172874110100200100082002001630037300371110201100991001001000010000003011171811611296450100001003003830038300383003830038
102043003723300000000612954725101001001000010010000500427716003001830037300372827172874010100200100082002001630037300371110201100991001001000010000009011171811611296450100001003003830038300383003830038
10204300372330000092641972954725101001001000010010000511427716003001830037300372825262873310100200100082002001630037300371110201100991001001000010000029011171811612296460100001003003830038300383003830038
1020430037233000000005302954725101001001000010010000500427716003001830037300372827162874110100200100082002001630037300371110201100991001001000010000020411171711611296460100001003003830038300383003830038
102043003723200000000612954725101001001000010010000500427716003001830037300372827172874110100200100082002001630037300371110201100991001001000010000003011171811611296460100001003003830038300383003830038
102043003723200000000612954725101001001000010010000500427716003009030037300372827162874110100200100082002001630037300371110201100991001001000010000003011171711611296450100001003003830038300383003830038
102043003723300000000612954725101001001000010010000500427716003001830037300372827162874110100200100082002001630037300371110201100991001001000010000003011171811611296450100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003723300000025129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000100000057064021622296290210000103003830038300383003830038
100243003723300000061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000000126064021622296290010000103003830085300383003830038
100243003723200001205362954725100101010000101000050427716003001830037300372828632876710010201000020200003003730037111002110910101000010000000064021622296290010000103003830038300383003830038
10024300372330000006129547251001010100001010000504277160030018300373003728286328767100102010000202000030037300371110021109101010000100000072064021622296290010000103003830038300383003830038
100243003723301000061295472510010101000010100005042771600302703032230274283113628899110642211316222229230401304058110021109101010000102221419788279558043299011010000103037030368303703041630414
1002430367235109812007926868294661861005012100721211350504281216030342302613041528318392885711364201000020200003008430037111002110910101000010000006081645627297752010000103026130324303703018030181
1002430460233107580461661295476210028101002410100007242771601301263003730037283072728767100102411082202360630273303709110021109101010000104220021051283321664300981010000103060730323303663064230369
100243055624500004806129547251001010100001010000504277160030018300373003728286328767100102010000202000030037300371110021109101010000100000093064021622296290010000103003830038300383003830038
1002430037233000000612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000003064021622296290010000103003830038300383003830038
100243003723400000010329547251001010100001010000504277160030018300373003728286328767100102010000202000030037300371110021109101010000100000099064021622296290010000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  usqadd h0, h8
  movi v1.16b, 0
  usqadd h1, h8
  movi v2.16b, 0
  usqadd h2, h8
  movi v3.16b, 0
  usqadd h3, h8
  movi v4.16b, 0
  usqadd h4, h8
  movi v5.16b, 0
  usqadd h5, h8
  movi v6.16b, 0
  usqadd h6, h8
  movi v7.16b, 0
  usqadd h7, h8

(register initialization, not part of the measured code:)

  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204200651660000002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000003011110119016002006201600001002006620066200662006620066
160204200651550000002925801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
160204200651550000002925801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010002003011110119016002006201600001002006620066200662006620066
160204200651560000302925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
1602042006515500000031425801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
160204200651550000002925801161008001610080028500640196120661204672056561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
160204200651560000005725801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010000010011110142016002006201600001002006620066200662014820066
160204200651560000002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
160204200651560000002925801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
160204200651560000002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000003011110119016002006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | 0e | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200711550045258001010800001080000506416801120028200462004632280010208000020160000200502005011160021109101016000010531003331116202116102004315160000102029220055200632004720047
160024200461550045258001010800001080000506400000120031200462004632280010208000020160000200462004621160021109101016000010001003331110202111062004315160000102028220059200472004720047
1600242004615600452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000101491002931110202111062004315160000102027520055200472004720047
1600242004615500452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000102001004031110222111062004315160000102025020059200512005120047
1600242004615600452580010108000010800005064000011200272004620046322800102080000201600002005020046111600211091010160000102701002931110202111062004315160000102023320055200472005120051
1600242005015500452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000101501002931110202116102004315160000102024820055200472005520047
160024200461550045258001010800001080000506400001120027200462004632280010208000020160000200462004611160021109101016000010001003331162021110102004315160000102022320055200472004720047
160024200461620045258042910800001080000506400001120027200462004632280010208000020160000200462004611160021109101016000010001003331110202111062004315160000102023120063200472004720047
160024200461560045258001010800001080000506400001120027200462004632280010208000020160000200462004611160021109101016000010701003331162021110102004315160000102025320055200472004720047
16002420046155007102580010108000010800005064000011200272004620046322800102080000201600002014820147111600211091010160000102601002931110202111062004315160000102022520055200472004720047

Test 5: throughput

Count: 16

Code:

  usqadd h0, h16
  usqadd h1, h16
  usqadd h2, h16
  usqadd h3, h16
  usqadd h4, h16
  usqadd h5, h16
  usqadd h6, h16
  usqadd h7, h16
  usqadd h8, h16
  usqadd h9, h16
  usqadd h10, h16
  usqadd h11, h16
  usqadd h12, h16
  usqadd h13, h16
  usqadd h14, h16
  usqadd h15, h16

(register initialization, not part of the measured code:)

  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020440049300375030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000000111101181600400361600001004004040040400404004040040
16020440039300120030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394008911160201100991001001600001000000111101181600400361600001004004040040400404004040040
16020440089300126030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000000111101181600400361600001004004040040400404004040040
16020440039300135030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000002111101351600400361600001004004040040400404004040040
16020440039300120030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000000111101541600400361600001004004040040400404004040040
1602044003930015610830251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000000111101181600400361600001004004040040400404004040040
1602044003930096030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000000111101181600400361600001004004040040400404004040040
16020440039300168030251601081001600081001600205001280132140020400394003919977619990160120200160144200320064400394003911160201100991001001600001000000111101181601400361600001004004040040400404004040040
160204400393000030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000000111101181600400361600001004004040040400404004040040
16020440039299165030251601081001600081001600205001280132140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000000111101181600400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024400393101000012352516001010160000101600005012800000154002040039400391999632001916001020160000203200004003940039111600211091010160000100017539360100238510411611127254003603110160000104004040040400404004040040
16002440039310009014625160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001000000010023851026161112227400360165160000104004040040400404004040040
16002440039310000016925160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001000000010023851027161112224400360165160000104004040040400404004040040
160024400393110000215525160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001000000010023851027161112627400360165160000104004040040400404004040040
16002440039317009014625160010101600001016000050128000001540020400394003919996320019160010201600002032000040039400391116002110910101600001000000010023851021161112521400360165160000104004040040400404004040040
160024400393100000169251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100238510261631227254003603110160000104004040040400404004040040
16002440039310000019925160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001000000010023851024161112225400360165160000104004040040400404004040040
160024400393100000150225160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001000000010022851017161112519400360165160000104004040040400404004040040
160024400393100000117925160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001000000010023851026161112717400360165160000104004040040400404004040040
160024400393100000211548251600101016000010160000501280000010400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100243311261632221254003603110160000104004040040400404004040040