Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRSRA (vector, 8B)

Test 1: uops

Code:

  srsra v0.8b, v1.8b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)031e1f3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
10043037230061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383038
10043037230061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383085
10043037240061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383038
10043037220061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383038
10043037230061254825100010001000398313030180303730372415328951000100020003037303711100110001073216222630100030383038303830383038
10043037230061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383038
10043037220061254825100010001000398313030180303730372415328951000100020003037303711100110001073216222630100030383038303830383038
10043037230061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383038
10043037230061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383038
10043037230061254825100010001000398313030180303730372415328951000100020003037303711100110000073216222630100030383038303830383038

Test 2: Latency 1->1

Code:

  srsra v0.8b, v1.8b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03080b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a9acc2cfd2d5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102043003723200000086782954825101001001000010010000500427731303001830037300372826522289041225123011492230226483041930607131102011009910010010000100000002958028313296340100001003003830038300383003830038
10204300852320111813207046129440200102011511006416310298718428952603034230659303632826532874510100200100002002000030037300371110201100991001001000010040002523507100116122999340100001003047030468305613069330470
10204305132361000120393329512831016410010024134107456764282741030198302313022728283222883710874212108312142165630274302265110201100991001001000010000242190508230256212974119100001003022730180302303018130234
1020430178282104467235266642950318810199150100561261044768642800270301263013630178282741928800104232081016720820334301813013331102011009910010010000100010030710011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000000710011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000000710011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000000710011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000000710011611296340100001003003830038300383003830038
102043003723800000018329548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000000710011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000000710011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)0308090b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2c5cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000000006402162229630010000103003830038300383003830325
1002430367227011779396165039294851611006213100481610894664285455130270303663036728310322888411060221066122219383031930365811002110910101000010020022968207866724429882310000103036830369303723035930368
100243036822810168933176463229494161100691510048131089465428665113023430368303682831334289001105924111572221944302743032081100211091010100001020242192584076641054629882110000103036930368303703037330181
100243036722811177924440612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000100006402162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010003020006402162229630010000103003830038300383003830085
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
10024300372240000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000000006402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  srsra v0.8b, v0.8b, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03080b18191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a6a7a8a9acc2c5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10204300372410000000276295472510100100100001001000050042771600300540300373003728271728741101002001017620620016300373003721102011009910010010000100000000011171801601296460100001003003830086300383003830038
1020430085241000000061295472510100100100001001000050042771600300180300373003728271728741101002001000820020016300373003711102011009910010010000100000000011172222422296290100001003003830038300383003830038
1020430037225000000197295472510100100100001001000050042771600300180300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172222422296290100001003003830038300383003830038
1020430037225000000061295472510100100100001001000050042771600300180300373003728271628741101002001000820020016300373003711102011009910010010000100000000011171701600296450100001003003830038300383003830038
10204300372250000000147295472510100100100001001000050042771600300180300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172222422297360100001003003830038300383003830038
102043003722500000013479295472510100100100001001000050042771600300180300373003728252628733101002001000020020000300373003711102011009910010010000100000000011171701600296450100001003003830038300383003830038
1020430037225000000061295472510100100100001001000050042771600300180300373003728271628741101002001000820020016300373003711102011009910010010000100000520011171801722296291100001003003830038300383003830038
102043003722500010001327295472510100100100001001000050042771600300180300373003728271728741101002001000820020016300373003711102011009910010010000100000000011171801600296450100001003003830038300383003830038
1020430037225000000061295472510100100100001001000050042771600300180300373003728271628741101002001000820020016300373003711102011009910010010000100000000011171801600296452100001003003830038300383003830038
10204300372250000180061295472510100100100001001000050042771600300180300373003728271628740101002001000820020016300373003711102011009910010010000100000000011171701600296460100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)030b181e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a8cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1002430037225000002732954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001004640316332962910000103003830038300383003830038
100243003722500000612954725100101010000101000050427716013001830037300372828603287671001020100002020000300373003711100211091010100001000640316332962910000103003830038300383003830038
100243003722500000612954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001000640316332962910000103003830038300383003830038
1002430037225000001452954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001000640316332962910000103003830038300383003830038
1002430037225000001452954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001000640316332962910000103003830038300383003830038
1002430037225000004402954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001000640316332962910000103003830038300383003830038
1002430037225000004682954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001000640316332962910000103003830038300383003830038
100243003722500000612954725100101010000101000050427716003001830037300372828603287851001020100002020000300373003711100211091010100001000640316332962910000103003830038300383003830038
100243003722500000612954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001005640316332962910000103003830038300383003830038
100243003722500000612954725100101010000101000050427716003001830037300372828603287671001020100002020000300373003711100211091010100001000640316332962910000103003830038300863003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  srsra v0.8b, v8.8b, #3
  movi v1.16b, 0
  srsra v1.8b, v8.8b, #3
  movi v2.16b, 0
  srsra v2.8b, v8.8b, #3
  movi v3.16b, 0
  srsra v3.8b, v8.8b, #3
  movi v4.16b, 0
  srsra v4.8b, v8.8b, #3
  movi v5.16b, 0
  srsra v5.8b, v8.8b, #3
  movi v6.16b, 0
  srsra v6.8b, v8.8b, #3
  movi v7.16b, 0
  srsra v7.8b, v8.8b, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01)cycle (02)030b1e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1acc2c5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16020420065151002925801161008001610080028500640196120045020065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
160204200651500122925801161008012110080028500640196020045020158200656338023420080028200160056200652006511160201100991001001600001002001111011901600200621600001002006620066200662006620066
1602042006515001831025801161008001610080028500640196120045020065200656128012820080450200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
16020420065151002925801161008001610080028500640196120045020065200656128012820080028200160056200652006511160201100991001001600001000301111011901600200621600001002006620066200662006620066
1602042006515002972925801161008001610080028500640196020045020065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
160204200651500122925801161008001610080028500640196020045020065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
16020420065150002925801161008001610080028500640196020045020065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
16020420065150002925801161008001610080028500640196120045020065200656128012820080136200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
16020420065150002925801161008001610080028500640196020045020065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
160204200651500422925801161008001610080028500640196020045020065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)031e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accdcfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaebec? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600242012615001562580010108000010800005064000011020027200502004632280010208000020160000201812004611160021109101016000010000100313111020321158200431500160000102004720047200472004720047
16002420046150045258001010800001080000506400001102002720046200463228001020800002016000020046200461116002110910101600001000010031311820312158200431500160000102004720047200472004720047
16002420046150045258001010800001080000506400001102002720046200463228001020800002016000020046200461116002110910101600001000010028311720272185200431500160000102004720047200472004720047
16002420046150045258001010800001080000506400001102002720046200463228001020800002016000020046200461116002110910101600001000010031311520352198200431500160000102004720047200472004720047
160024200461500325258001010800001080000506400001102002720046200463228001020800002016000020046200461116002110910101600001000010031311820272158200431500160000102004720047200472004720047
160024200461500452580010108000010800005064000011020027200462004632280010208000020160000200462004611160021109101016000010000100586229122212285200471500160000102005120047200472005120051
160024200501510114258001010800001080000506400000102003120050200503228001020800002016000020046200501116002110910101600001000010034622824232298200471500160000102004720047200512004720047
16002420050150051258001010800001080000506400000102003120046200503228001020800002016000020050200501116002110910101600001000010031311520312188200431500160000102004720047200472004720047
160024200461610213258001010800001080000506400001102002720046200463228001020800002016000020046201271116002110910101600001020010031311520311158200431500160000102004720047200472004720047
16002420046150045258001010800001080000506400001102002720046200463228001020800002016000020046200461116002110910101600001000010031311820291185200431500160000102004720047200472004720047

Test 5: throughput

Count: 16

Code:

  srsra v0.8b, v16.8b, #3
  srsra v1.8b, v16.8b, #3
  srsra v2.8b, v16.8b, #3
  srsra v3.8b, v16.8b, #3
  srsra v4.8b, v16.8b, #3
  srsra v5.8b, v16.8b, #3
  srsra v6.8b, v16.8b, #3
  srsra v7.8b, v16.8b, #3
  srsra v8.8b, v16.8b, #3
  srsra v9.8b, v16.8b, #3
  srsra v10.8b, v16.8b, #3
  srsra v11.8b, v16.8b, #3
  srsra v12.8b, v16.8b, #3
  srsra v13.8b, v16.8b, #3
  srsra v14.8b, v16.8b, #3
  srsra v15.8b, v16.8b, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)031e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a8acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160204400583000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400392990302516010810016000810016002062012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801321400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400694003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000011110118016004003601600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)0318191e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a6a8a9acc2cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0? int output thing (e9)eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160024400393000000462516001010160000101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000104202125021002231120162111021401510206160000104004040040400404004040040
16002440039300000067251600101016000010160000501280000114002004003940039199963200191600102016000020320000400394003911160021109101016000010000000100223119162122110400360206160000104004040040400404004040040
160024400393000000462516001010160000101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000100000001002231120162112020400360206160000104004040040400404004040040
160024400393000000462516001010160000101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000100020001002231118162112020400360206160000104004040040400404004040040
160024400393000000462516001010160000101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000100000001002231120162112020400360206160000104004040040400404004040040
16002440039300000071125160010101600001016000050128000011400200400394003919996320019160010201600002032000040039400391116002110910101600001000000010022311916211820400360206160000104004040040400404004040040
1600244003930000002362516001010160000101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000100000001002231120162112020400360206160000104004040040400404004040040
160024400393000030462516001010160098101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000100000001002231120162112020400360206160000104004040040400404004040040
16002440039300000046251600101016000010160000501280000114006204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223112016211920400360206160000104004040040400404004040040
160024400393000000462516001010160000101600005012800001140020040039400391999632001916001020160000203200004003940039111600211091010160000100000001002231120162112020400360206160000104004040040400404004040040