Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRSRA (vector, 8H)

Test 1: uops

Code:

  srsra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100430372301052548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037220612548251000100010003983130301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037230612548251000100010003983130301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037220612548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303722267612548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037230612548251000100010003983130301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037230612548251000100010003983130301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037230612548251000100010003983131301830373037241532895100010002000303730371110011000373116112630100030383038303830383038
10043037220612548251000100010003983130301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037230612548251000100010003983130301830373037241532895100010002000303730371110011000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  srsra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a7 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500072629548251010010010000100100005004277313300183003730037282653287621010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
1020430037225043206129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
102043003722502106129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007100161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
10204300372240006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
10204300372240006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250000019206129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064041644296300010000103003830038300383003830038
10024300372250000093308129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064041634296300010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064031634296300010000103003830038300383003830038
1002430037225000000069329548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064031634296300010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064041643296300010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064031644296300010000103003830038300383003830038
10024300372250000075606129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064041644296300010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064041634296300010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000064041644296300010000103003830038300383003830038
10024300372240000055206129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300851110021109101010000100000000064041634296300110000103003830038300383003830038

Test 3: Latency 1->2

Code:

  srsra v0.8h, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 07 | 09 | 0a | 18 | 19 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372251010000612954725101001001000010010000500427716030018300373003728271628741101002001000820020016300373003711102011009910010010000100000000011171811611296500100001003003830038300383003830038
10204300372251010000612954725101001001000010010000500427761830018300373003728271628741101002001000820020016300373003711102011009910010010000100000100011171711611296490100001003003830038300383003830038
10204300372251010000612954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172242444296290100001003003830038300383003830038
10204300372250000001972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102021009910010010000100000000011172242443296290100001003003830038300383003830038
10204300372250000001972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000001172242434296290100001003003830038300383003830038
10204300372240000001972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172212444296290100001003003830038300383003830038
10204300372250000001972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172242443296290100001003003830038300383003830038
10204300372250000001972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172232414296290100001003003830038300383003830038
102043003722500000012682954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172242434296290100001003003830038300383003830038
10204300372250000001972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100000003011172242434296290100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722500061295472510010101000010100005042771601300180300373003728286328767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
100243003722500061295472510010101000010100005042771601300180300373003728286828786100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
1002430037225000612954725100101010000101000050427716013001803003730037282863287671001020100002020000300373003711100211091010100001002400640216222962910000103003830038300383003830038
100253003722500061295472510010101000010100005042771601300180300373003728286328767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
100243003722500061295472510010101000010100005042771601300180300373003728286328767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
100243003722500061295472510010101000010100005042771600300180300373003728286328767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
100243003722500061295472510010101000010100005042771601300180300373003728286828767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
100243003722500061295472510010101000010100005042771600300180300373003728286328767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
100243003722500061295472510010101000010100005042771600300180300373003728286328767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038
100243003722500061295472510010101000010100005042771600300180300373003728286328767100102010000202000030037300371110021109101010000100000640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  srsra v0.8h, v8.8h, #3
  movi v1.16b, 0
  srsra v1.8h, v8.8h, #3
  movi v2.16b, 0
  srsra v2.8h, v8.8h, #3
  movi v3.16b, 0
  srsra v3.8h, v8.8h, #3
  movi v4.16b, 0
  srsra v4.8h, v8.8h, #3
  movi v5.16b, 0
  srsra v5.8h, v8.8h, #3
  movi v6.16b, 0
  srsra v6.8h, v8.8h, #3
  movi v7.16b, 0
  srsra v7.8h, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602042008915002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010013111101191600200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000111101191600200621600001002006620066200662006620066
1602042006515602925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010010111101191600200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010010111101191600200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000111101191600200621600001002006620066200662006620066
160204200651500276258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100473111101191600200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000111101191600200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010010111101191600200621600001002006620066200662006620066
16020420065150069425801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000111101191600200621600001002006620066200662006620066
1602042006515062925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000111101191600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 04 | 07 | 0a | 18 | 19 | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420075150003000045258001010800001080000506400001200332005220052322800102080000201600002005220052111600211091010160000100000010042622029263221931200493115160000102004720047200472004720047
1600242004615000100016325800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001005862203226322192820049310160000102005320053200532005320053
1600242005215001000016325800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001005862203126322323220049310160000102005320053200532005320053
1600242005215001200006325800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001005431103120211293120043150160000102004720047200472004720053
1600242005315000100016325800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001004762213326322263320049310160000102005320053200532005320053
1600242005315010200006325800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001005762203427322323220049310160000102005320053200532005320053
1600242005315011100016325800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001006162213426322332620049310160000102005320053200532005320053
160024200521501020034805125800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001005862213526322333420049310160000102005320053200532005320053
1600242005215011100005125800101080000108000050640000120033200522005032280010208000020160000200522005211160021109101016000010000001005862203326322343320049310160000102005320053200532005320053
1600242005215011100005125800101080000108000050640000120033200522005232280010208000020160000200522005211160021109101016000010000001005262213026322323420049310160000102005320053200532005320053

Test 5: throughput

Count: 16

Code:

  srsra v0.8h, v16.8h, #3
  srsra v1.8h, v16.8h, #3
  srsra v2.8h, v16.8h, #3
  srsra v3.8h, v16.8h, #3
  srsra v4.8h, v16.8h, #3
  srsra v5.8h, v16.8h, #3
  srsra v6.8h, v16.8h, #3
  srsra v7.8h, v16.8h, #3
  srsra v8.8h, v16.8h, #3
  srsra v9.8h, v16.8h, #3
  srsra v10.8h, v16.8h, #3
  srsra v11.8h, v16.8h, #3
  srsra v12.8h, v16.8h, #3
  srsra v13.8h, v16.8h, #3
  srsra v14.8h, v16.8h, #3
  srsra v15.8h, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204400592990030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011811600400361600001004004040040400404004040040
16020440039300013230251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011801600400361600001004004040040400404004040040
160204400393000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011801600400361600001004004040040400404004040040
160204400393000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011801600400361600001004004040040400404004040040
160204400392990030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011801600400361600001004004040040400404004040088
160204400392990030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011801600400361600001004004040040400404004040040
1602044003930000505251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011801600400861600001004004040040400404004040040
160204400393000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010001111011801600400361600001004004040040400404004040040
160204400393000064261601161001600161001600285001280196400294004940048199769199861601282001600382003200764004840049111602011009910010016000010002221012912311400451600001004004940049400504004940049
160204400483000064261601161001600161001600285001280196400294004840048199769199861601282001600382003200764004940048111602011009910010016000010002221012812311400461600001004004940049400494004940050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024400393000462516001010160000101600005012800000140020040039400391999632001916001020160000203200004003940039111600211091010160000100001002231181621131040036155160000104004040040400404004040040
160024400393000217251600101016000010160000501280000114002004003940039199963200191600102016000020320280400394003911160021109101016000010000100223113162113940036155160000104004040040400404004040040
16002440039299046251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000100223115162115940036155160000104004040040400404004040040
16002440039300046251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000100223115162114740036155160000104004040040400404004040040
16002440039300046251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000100223114162115740036155160000104004040040400404004040040
16002440039300046251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000100223115162115840036155160000104004040040400404004040040
160024400393000167251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000100223113162116740036155160000104004040040400404004040040
160024400393000462516001010160000101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000100011002231131621151140036155160000104004040040400404004040040
16002440039300046251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000100223114162115640036155160000104004040040400404004040040
16002440039300046251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000100223116162115640036155160000104004040040400404004040040