Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRI (vector, 8H)

Test 1: uops

Code:

  sri v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10042037161561168725100010001000264680020182037203715723189510001000200020372037111001100001373116111787100020382038203820382038
1004203717061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203717061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203716061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203716061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203716061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203717061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203716061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203717061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203716216116872510001000100026468002018203720371572318951000100020002037203711100110000015973116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sri v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000090611968725101001001000010010000500284768002001820037200371842203187451010020010000200200002003720037111020110099100100100001000000071011611197910100001002003820038200382003820038
102042003715000000024791963212010180125100601311076065928539810201982003720277184400291883510896200108302162131820275202286110201100991001001000010002709845802133111997029100001002027520230202672022920275
102042008515210668164402982196761601022013410072139107607062854095020198203212032118444033189071089822210332220216582032120321611020110099100100100001002000071012511198270100001002027820181201832037220086
10204201221510133270146106019654801015411610036124104206652851529120090201342022918432017187981057321510168210200002027320180311020110099100100100001000000071023711197910100001002003820038200382037420038
10204200371611034399440222519687251010010010000100100005002847680020054200842013318429091876110424202103262062066020133201342110201100991001001000010022024023710125211979129100001002003820038200382003820227
10204201861571100001601968745101001001000010010000626284768002001820037200371842203187451010020010000200200002003720037111020110099100100100001000000371011611197910100001002003820038200382003820038
1020420037150000000611968725101001001000010010000500284768002001820037200371842203187451010020010000200200002003720037111020110099100100100001000000071011611197910100001002003820038200382003820038
1020420037150000000611968725101001001000010010000500284768002001820037200371842203187451010020010000200200002003720037111020110099100100100001000000371011611197910100001002003820038200382003820038
102042003715000004890611968725101001001000010010000500284768002001820037200371842203187451010020010000200200002003720037111020110099100100100001000000071011611197910100001002003820038200382003820038
1020420037150000000611968725101001001000010010000500284768002001820037200371842203187451010020010000200200002003720037111020110099100100100001000000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640316221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184563187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001029640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sri v0.8h, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500061196862510100100100001001000050028475211200182003720037184286187401010020010008200200162003720037111020110099100100100001000011171701600198010100001002003820038200382003820038
10204200371500084196862510100100100001001000050028475211200182003720037184286187401010020010008200200162003720037111020110099100100100001000011171801600198010100001002003820038200382003820038
10204200371500061196862510100100100121031015250028475210200182003720037184287187411010020010008200200162003720037111020110099100100100001002011171801600198000100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475211200182003720037184286187411010020010008200200162003720037111020110099100100100001000011171801600198010100001002003820038200382003820038
10204200371500082196862510100100100001001000050028475211200182003720037184286187411010020010008200200162003720037111020110099100100100001000011171801600198010100001002003820038200382003820038
102042003715000126196862510100100100001001000050028475211200182003720037184286187411010020010008200200162003720037111020110099100100100001000011171801600198010100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184286187411010020010008200200162003720037111020110099100100100001000011171701600198000100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475211200182003720037184287187411010020010008206200002003720037111020110099100100100001000011172222422197870100001002003820038200382003820038
10204200371500197196862510100100100001001000050028475211200182003720037184096187331010020010000200200002003720037111020110099100100100001000011172222422197870100001002003820038200382003820038
102042003715001118196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000011172222422197870100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500000000022919686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100000006402162219786010000102003820038200382003820038
1002420037150000000006119686251001010100001010000502847521200182003720037184430318767100102010000202000020084200842110021109101010000100000006402162219786010000102003820038200382003820038
1002420037150000000006119686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100000006402162219786010000102003820038200382003820038
10024200371500000000022019686251001010100001010000502847521200182003720037184430318767100102010180202000020037200371110021109101010000100000006402162219786010000102003820038200382003820038
10024200371500000000010519686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100000006402162219786010000102003820038200382003820038
10024200371490000000014519686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100000006402162219786010000102003820038200382003820038
1002420037150000000006119686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100030006402162219786010000102003820038200382003820038
10024200371500000000014519686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100000006692162219786010000102003820038200382003820038
10024200371500000000021319686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100000006402162219786010000102003820038200382003820038
10024200371500000000095719686251001010100001010000502847521200182003720037184430318767100102010000202000020037200371110021109101010000100000606402162219786010000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sri v0.8h, v8.8h, #3
  movi v1.16b, 0
  sri v1.8h, v8.8h, #3
  movi v2.16b, 0
  sri v2.8h, v8.8h, #3
  movi v3.16b, 0
  sri v3.8h, v8.8h, #3
  movi v4.16b, 0
  sri v4.8h, v8.8h, #3
  movi v5.16b, 0
  sri v5.8h, v8.8h, #3
  movi v6.16b, 0
  sri v6.8h, v8.8h, #3
  movi v7.16b, 0
  sri v7.8h, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2510

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420080150000000712580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000000011110119116002006201600001002006620066200662006620066
16020420065150000000292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000000011110119016002006201600001002006620066200662006620066
16020420065151000000292580116100800161008002850064019612004420130200656128012820080028200160056200652006511160201100991001001600001000000000011110119116002006201600001002006620066200662006620066
16020420065151000000292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000000011110119016002006201600001002006620066200662006620066
16020420065150000000292580116100800161008013150064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000000011110131123112007401600001002007820077200772007820077
16020420131150000000642880119100800191008003150064022002005620076200769108013120080031200160062200762007611160201100991001001600001000000000022210146123112007301600001002007820078200782007720078
16020420077150000000642880119100800191008003150064022002005620076200769108013120080031200160062200772007711160201100991001001600001000000000022210130123112007401600001002007820077200772007820077
16020420076150000000642880119100800191008003150064022012005620076200779108013120080031200160062200762007611160201100991001001600001000000000022210130123112007401600001002007820077200772007820077
1602042007615000000012728801191008001910080031500640220020056200762007710108013120080031200160062200762007611160201100991001001600001000000000022210130123112007401600001002007820078200782007820077
160204200771500000001272880119100800191008003150064022002005620076200779108013120080031200160062200772007611160201100991001001600001000000000022210131123112007301600001002007720077200772007820077

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420106150000003107278001010800001080000506400001120031200502005032180010208000020160000200502005011160021109101016000010469100463112534422242320047201160000102005120051200512005120051
160024200501500000034527800101080000108000050640000112004020050200503218001020800002016000020050200501116002110910101600001030100483122325211232320047201160000102005120051200512005120051
1600242005015000000147827800101080000108000050640000112003120050200503218001020800002016000020050200501116002110910101600001000100463112325211222520047201160000102005120051200602005120051
160024200501500000024427800101080000108000050640000112003120050200503218011120800002016000020050200501116002110910101600001010100473112525211272420047201160000102005120051200512005120051
160024200501500000014427800101080000108000050640000112003120050200503218001020800002016000020050200501116002110910101600001000100503112425211242420047201160000102005120051200512005120051
1600242005015000000136427800101080000108000050640000112003120050200503218001020800002016000020050200591116002110910101600001000100463112325211222320047201160000102005120051200512005120051
1600242005015000000110927800101080077108000050640000112003120050200503218001020800002016000020050200501116002110910101600001000100473112434412242420056402160000102005120051200512005120051
16002420050150000360110927800101080000108000050640000112003120050200503218001020800002016000020050200501116002110910101600001003100473112425211252420047201160000102005120051200512005120051
1600242005015000000213227800101080000108000050640000112003120050200503218001020800002016000020050200501116002110910101600001000100466122325211232320056201160000102006020051200602005120051
160024200501500000024427800101080000108000050640000212003120050200503218001020800002016000020050200501116002110910101600001000100473112434221272520056201160000102005120051200512005120051

Test 5: throughput

Count: 16

Code:

  sri v0.8h, v16.8h, #3
  sri v1.8h, v16.8h, #3
  sri v2.8h, v16.8h, #3
  sri v3.8h, v16.8h, #3
  sri v4.8h, v16.8h, #3
  sri v5.8h, v16.8h, #3
  sri v6.8h, v16.8h, #3
  sri v7.8h, v16.8h, #3
  sri v8.8h, v16.8h, #3
  sri v9.8h, v16.8h, #3
  sri v10.8h, v16.8h, #3
  sri v11.8h, v16.8h, #3
  sri v12.8h, v16.8h, #3
  sri v13.8h, v16.8h, #3
  sri v14.8h, v16.8h, #3
  sri v15.8h, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | da | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440057300000000011325160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039
1602044003830000000002925160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039
1602044003829900000002925160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039
16020440038300000000025025160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394007640039
1602044003829900000002925160108100160008100160020500128013204001940038400381997706199891601202001600322003202724003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039
1602044003830000000302925160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940090
1602044003830000000002925160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039
1602044003830000000002925160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039
1602044003830000000002925160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039
1602044003830000000002925160108100160008100160020500128013204001940038400381997706199891601202001600322003200644003840038111602011009910010016000010000011110118160004003501600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 07 | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024400462990009925160010101600001016000050128000011400194003840038199963200181600102016000020320000400384003811160021109101016000010001002231128162112422400350208160000104003940039400394003940039
1600244003830000045251600101016000010160000501280000114001940038400381999632001816001020160000203200004003840038111600211091010160000100010022312211621221244003504016160000104003940039400394003940039
160024400383000004525160010101600001016000050128000011400194003840038199963200181600102016000020320000400384003811160021109101016000010001002231122162112624400350208160000104003940039400394003940039
1600244003830000017125160010101600001016000050128000001400194003840038199963200181600102016000020320000400384003811160021109101016000010001002231122162112222400350208160000104003940039400394003940039
160024400383000004525160010101600001016000050128000011400194003840038199963200181600102016000020320000400384003811160021109101016000010001002231123162112221400350208160000104003940039400394003940039
160024400382990004525160010101600001016000050128000011400984003840139199963200181600102016000020320000400384003811160021109101016000010001002231115162112121400350208160000104003940039400394003940039
16002440038300000553525160010101600001016000050128000011400194003840038199963200181600102016000020320000400384003811160021109101016000010001002231122162112319400350208160000104003940039400394003940039
1600244003830000118225160010101600001016000050128000011400194003840038199963200181600102016000020320000400384003811160021109101016000010101002231121162112321400350208160000104003940039400394003940039
1600244003830000012925160010101600001016010850128000011400194003840038199963200181600102016000020320000400384003811160021109101016000010001002261123162112520400350208160000104003940039400394003940039
160024400383000004550160103101600951016009850128077611401124015640155200137200471601092016000020320000400384003811160021109101016000010001002231124162112424400350208160000104003940039400394003940039