Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRSRA (vector, 4S)

Test 1: uops

Code:

  srsra v0.4s, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)03081e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
1004303723006125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723096125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303722006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230756125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220216125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831313018303730372415328951000100020003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  srsra v0.4s, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03080b181e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10204300372250009757295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037224000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000203000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037224000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037224000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)0318191e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acbranch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1002430037225000612954825100101010000101000050427731330018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731330018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
10024300372250018612954825100101010008101000050428002730018301323003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
10024300372250012612954843100101010000101000060427731330018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731330018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282872128767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731330018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
1002430037225003612954825100101010000101000050427731330018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
1002430037225009612954825100101010000101000050427731330018300373003728287328767100102010000202000030037300371110021109101010000100006402162229630010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282873287671001020100002020000300373003711100211091010100001016306402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  srsra v0.4s, v0.4s, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)031e3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102043003722500124295472510100100100001001000050042771600300183003730037282716287401010020010008200200163003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
102043003722501162295472510100100100001001000050042771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
102043003722501120295472510100100100001001000051142771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
102043003722501510295472510100100100001001000050042771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
102043003722501476295474410100100100001001000050042771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
102043003722401500295472510100100100001001000050042771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
102043003722501143295472510100100100001001000050042771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
102043003722501141295472510100100100001001000050042771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038
10204300372250112029547251010010010000100100005004277160030018300373003728252628733101002001000020020000300373003711102011009910010010000100013511172221722296290100001003003830038300383003830038
102043008422501141295472510100100100001001000050042771600300183003730037282526287331010020010000200200003003730037111020110099100100100001000011171801600296460100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03181e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100243003722500149295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722500495295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722500187295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250084295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722500302295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722500191295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722500660295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722500501295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001013640216222962910000103003830038300383003830038
100243003722500236295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722500191295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  srsra v0.4s, v8.4s, #3
  movi v1.16b, 0
  srsra v1.4s, v8.4s, #3
  movi v2.16b, 0
  srsra v2.4s, v8.4s, #3
  movi v3.16b, 0
  srsra v3.4s, v8.4s, #3
  movi v4.16b, 0
  srsra v4.4s, v8.4s, #3
  movi v5.16b, 0
  srsra v5.4s, v8.4s, #3
  movi v6.16b, 0
  srsra v6.4s, v8.4s, #3
  movi v7.16b, 0
  srsra v7.4s, v8.4s, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01)cycle (02)03191f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acc2c5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16020420090151004622580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001005001111011921611200621600001002006620066200662006620066
16020420065150001382580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911621200621600001002006620066200662006620066
16020420065150005792580116100800161008002850064019602004520065200656128012820080028200160056201572006511160201100991001001600001000301111011911611200621600001002006620066200662006620066
16020420065150001592580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911611200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911611200621600001002006620066200662006620066
16020420065150001592580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911611200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911611200621600001002006620066200662006620066
16020420065151001192580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911611200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911611200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000001111011911611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)03081e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8accfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160024200561510001102580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000310027311220211352004315160000102004720047200472004720047
160024200461500009622580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010027311420211352004315160000102004720047200472004720047
160024200461500007172580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010027311520211532004315160000102004720047200472004720047
160024200461500009762580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010028311520211232004315160000102012620128201262012820047
16002420046150027010302580010108010510800005064000011200272004620046322800102080000201600002004620046111600211091010160000100201010052311520211342004315160000102004720047200472004720047
1600242004615000010322580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010028311520211442004315160000102004720047200472004720047
1600242004615000010702580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000310028311320211442004315160000102004720047200472004720047
160024200461510003442580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010028311520211442004315160000102004720047200472004720047
160024200461500008542580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100000010027311420211442004315160000102004720047200472004720047
16002420046150012010082580010108000010800005064000011200312004620046322800102080000201600002004620046111600211091010160000100000010026311320211442004315160000102004720047200472004720047

Test 5: throughput

Count: 16

Code:

  srsra v0.4s, v16.4s, #3
  srsra v1.4s, v16.4s, #3
  srsra v2.4s, v16.4s, #3
  srsra v3.4s, v16.4s, #3
  srsra v4.4s, v16.4s, #3
  srsra v5.4s, v16.4s, #3
  srsra v6.4s, v16.4s, #3
  srsra v7.4s, v16.4s, #3
  srsra v8.4s, v16.4s, #3
  srsra v9.4s, v16.4s, #3
  srsra v10.4s, v16.4s, #3
  srsra v11.4s, v16.4s, #3
  srsra v12.4s, v16.4s, #3
  srsra v13.4s, v16.4s, #3
  srsra v14.4s, v16.4s, #3
  srsra v15.4s, v16.4s, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire (01)cycle (02)030708090a0b18191e1f3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a7a8a9acc2c5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16020440060300100100000030251601081001600081001600205001280132014002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811621400361600001004004040040400404004040040
160204400393001001000000505251601081001600081001600205001280132014002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811621400361600001004004040040400404004040040
160204400393001001000180030251601081001600081001600205001280132014002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811621400361600001004004040040400404004040040
16020440039300100100000030251601081001600081001600205001280132014002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000301111011811632400361600001004004040040400404004040040
160204400393001001000000302516010810016000810016002050012801320140020400394003919977261999016012020016003220032006440039400391116020110099100100160000100000000001111011811631400361600001004004040040400404004040040
16020440039300100100000030251601081001600081001600205001280132004002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811621400361600001004004040040400404004040040
160204400392991001000000125251601081001600081001600205001280132004002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811621400361600001004004040040400404004040040
16020440039300100100000030251601081001600081001600205001280132014002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811631400361600001004004040040400404004040040
16020440039299100100000030251601081001600081001600205001280132014002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811621400361600001004004040040400404004040040
16020440039300100100000030251601081001600081001600205001280132004002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000001111011811621400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)03041e3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acbranch mispredict (cb)cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600244005030010010925160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000100223125164119340036155160000104004040040400404004040040
1600244003930000011325160010101600001016000050128000011400204003940039199963200191601282016000020320000400394003911160021109101016000010000100223113162115540036155160000104004040040400404004040040
160024401013000004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000100223119162114340036155160000104004040040400404004040040
1600244003930000046251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100001002231151621112340036155160000104004040040400404004040040
160024400393000004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000100223113162116640036155160000104004040040400404004040040
1600244003930000051251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100001002231151621111340036155160000104004040040400404004040040
1600244003930000052125160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000100223115162115940036155160000104004040040400404004040040
160024400392990004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000100223113162117340036155160000104004040040400404004040040
16002440039299000101925160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000100223119162115940036155160000104004040040400404004040040
160024400393000004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000100223113162115940036155160000104004040040400404004040093