Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UADALP (vector, 2S)

Test 1: uops

Code:

  uadalp v0.2s, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)03l2 tlb miss data (0b)191e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d tlb miss (a1)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
100430372300186125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230096125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
100430372200156125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  uadalp v0.2s, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)0309l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000200071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731303001830084300372826532874510100200100002002000030037300371110201100991001001000010000004030071022611296340100001003003830038300383003830038
10204300372250000000103295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003008630038300383003830038
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000100071011611296347100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000200071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100081001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000503071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037224000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk data (08)18191e3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002430037225000006129548361001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000102418640516452963010000103003830038300383003830038
10024300372240003010329548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000102612640516332963010000103003830038300383003830038
1002430037226100911032954825100101110016101000055427731313001830084301802828710287851001020100002020000300373003711100211091010100001013640416452963010000103003830038300383003830038
1002430037225000001032954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010193640416542963010000103003830038300383003830038
10024300372250000061295482510038131000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001020649516552963010000103003830038300383003830038
10024300372250000061295482510010101000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001000640516552969910000103003830038300383003830038
10024300372250000061295482510010101000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001020640416452963010000103003830038300383003830038
100243003722500000612954825100101010000101000050427731313001830037300372828732876710010201000020203443003730070111002110910101000010219640516552963010000103003830038300383003830038
10024300372250000061295482510044111000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001010640416552963010000103003830038300383003830038
10024300372240000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001030640316452963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  uadalp v0.2s, v0.4h
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk data (08)1e3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372320006129547251010010010000100100005004277160300183003730037282520628733101002001000020020000300373003711102011009910010010000100101117222242229629100001003003830038300383003830038
10204300372250009729547251010010010000100100005004277160300183003730037282520628733101002001000020020000300373003711102011009910010010000100101117222242229629100001003003830038300383003830038
10204300372250009729547251010010010000100100005004277160300183003730037282520628733101002001000020020000300373003711102011009910010010000100101117222242229629100001003003830038300383003830038
10204300372250009729547251010010010000100100005004277160300183003730037282710628740101002001000820020016300373003711102011009910010010000100101117170160029646100001003003830038300383003830038
102043003722500061295472510142100100001001000050042771603001830037300372827107287411010020010008200200163003730037111020110099100100100001002691117180160029645100001003003830038300383003830038
10204300372250006129547251010010010000100100005004277160300183003730037282710628740101002001000820020016300373003711102011009910010010000100201117170160029645100001003003830038300383003830038
10204300372250006129547251010010010000100100005164277160300183003730037282710728741101002001000820020016300373003711102011009910010010000100301117180160029646100001003003830038300383003830038
1020430037225000579929547251010010010000100100005004277160300183003730037282710628741101002001000820020016300373003711102011009910010010000100401117170160029645100001003003830038300383003830038
10204300372250006129547251010010010000100100005004277160300183003730037282710728740101002001000820020016300373003711102011009910010010000100401117180160029645100001003003830038300383003830038
102043003722400061295472510110108100001001000050042771603001830037300372825206287331010020010000200200003003730037111020110099100100100001002101117222242229629100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000008229547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372240000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001010640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  uadalp v0.2s, v8.4h
  movi v1.16b, 0
  uadalp v1.2s, v8.4h
  movi v2.16b, 0
  uadalp v2.2s, v8.4h
  movi v3.16b, 0
  uadalp v3.2s, v8.4h
  movi v4.16b, 0
  uadalp v4.2s, v8.4h
  movi v5.16b, 0
  uadalp v5.2s, v8.4h
  movi v6.16b, 0
  uadalp v6.2s, v8.4h
  movi v7.16b, 0
  uadalp v7.2s, v8.4h
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)031e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
160204200881510553258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651500284258011610080016100800285006401961200452014720146612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651506329258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150092258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651500207258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065151029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)031e3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0ea? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002420069150004442580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100000010027622424412442025130160000102004720051200512004720051
1600242005015000722580010108000010800005064000011200272005020046322800102080133201600002004620050111600211091010160000100000010030622424422552024830160000102004720047200472004720047
160024200461550037932580010108000010800005064000011200312004620046322800102080000201600002005020046111600211091010160000100000010026311420211442023615160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010027311420211442019515160000102005120047200512005120047
16002420046150002162580010108000010800005064000011200272004620046322800102080000201602662004620046111600211091010160000100000010027311420211542021315160000102004720047200472004720047
16002420046150007102580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010028311520211452021515160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010026311220211532019115160000102004720047200472004720047
1600242004615000872580010108000010800005064000011200272004620050322800102080000201600002004620046111600211091010160000100000010027311420211442023515160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010026311220211342020815160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010027311420211332021215160000102004720047200472004720047

Test 5: throughput

Count: 16

Code:

  uadalp v0.2s, v16.4h
  uadalp v1.2s, v16.4h
  uadalp v2.2s, v16.4h
  uadalp v3.2s, v16.4h
  uadalp v4.2s, v16.4h
  uadalp v5.2s, v16.4h
  uadalp v6.2s, v16.4h
  uadalp v7.2s, v16.4h
  uadalp v8.2s, v16.4h
  uadalp v9.2s, v16.4h
  uadalp v10.2s, v16.4h
  uadalp v11.2s, v16.4h
  uadalp v12.2s, v16.4h
  uadalp v13.2s, v16.4h
  uadalp v14.2s, v16.4h
  uadalp v15.2s, v16.4h
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)181e373f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1602044006030000001739251601091001600081001600205001280132400294004940039199776199911601202001600322003200644004040049111602011009910010016000010000011110118016004003601600001004004940049400494004040040
1602044004930000001831251601171001600091001600205001320129400214004040049199776199901601202001600322003200644004940039111602011009910010016000010000011110118016004003701600001004004040050400404004040050
1602044004030000001730251601091001600091001600205001280132400204004040049199776199901601202001600322003200644004840039111602011009910010016000010000011110118016004003601600001004005040040400404004040040
160204400393000000096251601171001600081001600205002399131400294003940039199776199991601202001600322003200644003940040111602011009910010016000010040011110118016004003601600001004004040041400504004940040
160204400403000000130251601091001600081001600205002399159400294003940049199776199991601202001600322003200644004040039111602011009910010016000010000011110118016004004501600001004004040040400404005040040
1602044003930000001731251601081001600091001600205002399159400294012840039199776199911601202001600322003200644004940039111602011009910010016000010000011110118016004003601600001004004040040400414005040049
160204400492990000031251601081001600091001600205001320129400214003940040199776200001601202001600322003200644003940039111602011009910010016000010000011110118016004004501600001004004040050400404004140041
160204400493000000031251601171001600091001600205001280132400214004040049199776199901601202001600322003200644004940039111602011009910010016000010001011110118016004003601600001004004040040400504005040040
1602044003930000001302516011710016001710016002050012801324002140039400401997761999016012020016003220032006440049401141116020110099100100160000100045011110118016004004601600001004004040040400414004140040
1602044004930000001864271601341001600171001600285002387995400384004840048199769199861601282001600382003200764004940048111602011009910010016000010000022210129123114004501600001004005040051400514005840050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)03l2 tlb miss data (0b)1e373f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600244004930000046251600111016000010160000502398999115400200400494004919996320029160010201600002032000040039400491116002110910101600001000100228111016211111240045155160000104004040049400404004940040
1600244004930000175525160027101600011016000050128000011540020040048400481999632002816001020160000203200004004940039111600211091010160000100010022841111622251140046156160000104005040040400504005040041
1600244003929900071125160010101600001016000050128000011540020040040400481999632001916001020160000203200004004940048111600211091010160000100010022852101642181340045155160000104004140049400504004040050
160024400403000018622516001110160000101600005023990270154003004003940040199963200291600102016000020320000400394004011160021109101016000010001002411521016422101140036155160000104004040050400404005040040
1600244004029900175625160011101600001016000050237891011540020040048400391999632001916001020160000203200004004040049111600211091010160000100010022841121621191240037156160000104004940050400404004940049
160024400483000004625160011101600181016000050128000011540020040048400391999632002016001020160000203200004004940049111600211091010160000100010022841121621161440036155160000104004040050400494004940050
1600244004930000184725160027101600001016000050128000011540030040040400401999632002016001020160000203200004004840048111600211091010160000100010022841101621171240037156160000104004040041400404005040040
1600244005130000184725160011101600801016000050128000011540020040039400491999632002916001020160000203200004004940039111600211091010160000100310022841101621171140045155160000104004140050400404004140040
1600244004030100061625160027101600171016000050128000011540020040039400491999632002016001020160000203202804003940039111600211091010160000100010022841101621171340036156160000104005040040400414004040041
16002440048299001756251600271016001710160000502399027115400210400494003919996320020160010201600002032000040049400401116002110910101600001000100228411216211101140046155160000104004040050400414005040040