Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

UADALP (vector, 2S)

Test 1: uops

Code:

  uadalp v0.2s, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)030b191e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a1accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
100430372300186125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230096125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
100430372200156125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  uadalp v0.2s, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03090b18191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a7a8a9acc2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000200071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731303001830084300372826532874510100200100002002000030037300371110201100991001001000010000004030071022611296340100001003003830038300383003830038
10204300372250000000103295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003008630038300383003830038
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000100071011611296347100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000200071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100081001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000503071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037224000000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)030818191e3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1002430037225000006129548361001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000102418640516452963010000103003830038300383003830038
10024300372240003010329548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000102612640516332963010000103003830038300383003830038
1002430037226100911032954825100101110016101000055427731313001830084301802828710287851001020100002020000300373003711100211091010100001013640416452963010000103003830038300383003830038
1002430037225000001032954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010193640416542963010000103003830038300383003830038
10024300372250000061295482510038131000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001020649516552963010000103003830038300383003830038
10024300372250000061295482510010101000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001000640516552969910000103003830038300383003830038
10024300372250000061295482510010101000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001020640416452963010000103003830038300383003830038
100243003722500000612954825100101010000101000050427731313001830037300372828732876710010201000020203443003730070111002110910101000010219640516552963010000103003830038300383003830038
10024300372250000061295482510044111000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001010640416552963010000103003830038300383003830038
10024300372240000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001030640316452963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  uadalp v0.2s, v0.4h
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03081e3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10204300372320006129547251010010010000100100005004277160300183003730037282520628733101002001000020020000300373003711102011009910010010000100101117222242229629100001003003830038300383003830038
10204300372250009729547251010010010000100100005004277160300183003730037282520628733101002001000020020000300373003711102011009910010010000100101117222242229629100001003003830038300383003830038
10204300372250009729547251010010010000100100005004277160300183003730037282520628733101002001000020020000300373003711102011009910010010000100101117222242229629100001003003830038300383003830038
10204300372250009729547251010010010000100100005004277160300183003730037282710628740101002001000820020016300373003711102011009910010010000100101117170160029646100001003003830038300383003830038
102043003722500061295472510142100100001001000050042771603001830037300372827107287411010020010008200200163003730037111020110099100100100001002691117180160029645100001003003830038300383003830038
10204300372250006129547251010010010000100100005004277160300183003730037282710628740101002001000820020016300373003711102011009910010010000100201117170160029645100001003003830038300383003830038
10204300372250006129547251010010010000100100005164277160300183003730037282710728741101002001000820020016300373003711102011009910010010000100301117180160029646100001003003830038300383003830038
1020430037225000579929547251010010010000100100005004277160300183003730037282710628741101002001000820020016300373003711102011009910010010000100401117170160029645100001003003830038300383003830038
10204300372250006129547251010010010000100100005004277160300183003730037282710728740101002001000820020016300373003711102011009910010010000100401117180160029645100001003003830038300383003830038
102043003722400061295472510110108100001001000050042771603001830037300372825206287331010020010000200200003003730037111020110099100100100001002101117222242229629100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03080b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000008229547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372240000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250000006129547251001010100001010000504277160300183003730037282863287671001020100002020000300373003711100211091010100001010640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  uadalp v0.2s, v8.4h
  movi v1.16b, 0
  uadalp v1.2s, v8.4h
  movi v2.16b, 0
  uadalp v2.2s, v8.4h
  movi v3.16b, 0
  uadalp v3.2s, v8.4h
  movi v4.16b, 0
  uadalp v4.2s, v8.4h
  movi v5.16b, 0
  uadalp v5.2s, v8.4h
  movi v6.16b, 0
  uadalp v6.2s, v8.4h
  movi v7.16b, 0
  uadalp v7.2s, v8.4h
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01)cycle (02)031e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160204200881510553258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651500284258011610080016100800285006401961200452014720146612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651506329258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150092258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651500207258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065151029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)031e3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a6a8a9accfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16002420069150004442580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100000010027622424412442025130160000102004720051200512004720051
1600242005015000722580010108000010800005064000011200272005020046322800102080133201600002004620050111600211091010160000100000010030622424422552024830160000102004720047200472004720047
160024200461550037932580010108000010800005064000011200312004620046322800102080000201600002005020046111600211091010160000100000010026311420211442023615160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010027311420211442019515160000102005120047200512005120047
16002420046150002162580010108000010800005064000011200272004620046322800102080000201602662004620046111600211091010160000100000010027311420211542021315160000102004720047200472004720047
16002420046150007102580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010028311520211452021515160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010026311220211532019115160000102004720047200472004720047
1600242004615000872580010108000010800005064000011200272004620050322800102080000201600002004620046111600211091010160000100000010027311420211442023515160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010026311220211342020815160000102004720047200472004720047
1600242004615000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010027311420211332021215160000102004720047200472004720047

Test 5: throughput

Count: 16

Code:

  uadalp v0.2s, v16.4h
  uadalp v1.2s, v16.4h
  uadalp v2.2s, v16.4h
  uadalp v3.2s, v16.4h
  uadalp v4.2s, v16.4h
  uadalp v5.2s, v16.4h
  uadalp v6.2s, v16.4h
  uadalp v7.2s, v16.4h
  uadalp v8.2s, v16.4h
  uadalp v9.2s, v16.4h
  uadalp v10.2s, v16.4h
  uadalp v11.2s, v16.4h
  uadalp v12.2s, v16.4h
  uadalp v13.2s, v16.4h
  uadalp v14.2s, v16.4h
  uadalp v15.2s, v16.4h
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire (01)cycle (02)03080b181e373f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a8acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1602044006030000001739251601091001600081001600205001280132400294004940039199776199911601202001600322003200644004040049111602011009910010016000010000011110118016004003601600001004004940049400494004040040
1602044004930000001831251601171001600091001600205001320129400214004040049199776199901601202001600322003200644004940039111602011009910010016000010000011110118016004003701600001004004040050400404004040050
1602044004030000001730251601091001600091001600205001280132400204004040049199776199901601202001600322003200644004840039111602011009910010016000010000011110118016004003601600001004005040040400404004040040
160204400393000000096251601171001600081001600205002399131400294003940039199776199991601202001600322003200644003940040111602011009910010016000010040011110118016004003601600001004004040041400504004940040
160204400403000000130251601091001600081001600205002399159400294003940049199776199991601202001600322003200644004040039111602011009910010016000010000011110118016004004501600001004004040040400404005040040
1602044003930000001731251601081001600091001600205002399159400294012840039199776199911601202001600322003200644004940039111602011009910010016000010000011110118016004003601600001004004040040400414005040049
160204400492990000031251601081001600091001600205001320129400214003940040199776200001601202001600322003200644003940039111602011009910010016000010000011110118016004004501600001004004040050400404004140041
160204400493000000031251601171001600091001600205001280132400214004040049199776199901601202001600322003200644004940039111602011009910010016000010001011110118016004003601600001004004040040400504005040040
1602044003930000001302516011710016001710016002050012801324002140039400401997761999016012020016003220032006440049401141116020110099100100160000100045011110118016004004601600001004004040040400414004140040
1602044004930000001864271601341001600171001600285002387995400384004840048199769199861601282001600382003200764004940048111602011009910010016000010000022210129123114004501600001004005040051400514005840050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)030b1e373f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600244004930000046251600111016000010160000502398999115400200400494004919996320029160010201600002032000040039400491116002110910101600001000100228111016211111240045155160000104004040049400404004940040
1600244004930000175525160027101600011016000050128000011540020040048400481999632002816001020160000203200004004940039111600211091010160000100010022841111622251140046156160000104005040040400504005040041
1600244003929900071125160010101600001016000050128000011540020040040400481999632001916001020160000203200004004940048111600211091010160000100010022852101642181340045155160000104004140049400504004040050
160024400403000018622516001110160000101600005023990270154003004003940040199963200291600102016000020320000400394004011160021109101016000010001002411521016422101140036155160000104004040050400404005040040
1600244004029900175625160011101600001016000050237891011540020040048400391999632001916001020160000203200004004040049111600211091010160000100010022841121621191240037156160000104004940050400404004940049
160024400483000004625160011101600181016000050128000011540020040048400391999632002016001020160000203200004004940049111600211091010160000100010022841121621161440036155160000104004040050400494004940050
1600244004930000184725160027101600001016000050128000011540030040040400401999632002016001020160000203200004004840048111600211091010160000100010022841101621171240037156160000104004040041400404005040040
1600244005130000184725160011101600801016000050128000011540020040039400491999632002916001020160000203200004004940039111600211091010160000100310022841101621171140045155160000104004140050400404004140040
1600244004030100061625160027101600171016000050128000011540020040039400491999632002016001020160000203202804003940039111600211091010160000100010022841101621171340036156160000104005040040400414004040041
16002440048299001756251600271016001710160000502399027115400210400494003919996320020160010201600002032000040049400401116002110910101600001000100228411216211101140046155160000104004040050400414005040040