Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SLI (vector, 8H)

Test 1: uops

Code:

  sli v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004203715158216872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715010316872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sli v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 04 | 08 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150003576119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
10204200371500096119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007811161119791100001002003820038200382003820038
10204200371490006119687251010010010000100100005002848963020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
10204200371500006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
10204200371500006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
10204200371500006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
10204200371500008219687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100007571161119791100001002003820038200382003820038
10204200371500006119687251010010010000100101525002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
1020420037150001686119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
10204200371500008219687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150000540611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
100242003715000000611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
1002420037150000540611968725100101010000101000050284768002001820037200371844431876710010201000020200002008420037111002110910101000010006402162219785010000102003820038200382003820038
1002420037150000001701968725100101010000101000050284768002001820037200371844431876710010201000020203342003720037111002110910101000010006402162219785010000102003820038200382003820038
10024200371500002880611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010036612162219785010000102003820038200382003820038
100242003715011221988611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010006402163219785010000102003820038200382003820074
10024200371500002580611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
1002420037150000002511968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
100242003715000000611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
100242003715000000611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sli v0.8h, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715025506119686251011510010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001001117180160019800100001002003820038200382003820038
1020420037150006119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001001117180160019801100001002003820038200382003820038
1020420037150006119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001001117180160019800100001002003820038200382003820038
1020420037150006119686431010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001001117180160019801100001002003820038200382003820038
1020420037150006119686251010010010000100100005002847521200182003720037184416187401010020010008200200162003720037111020110099100100100001001117170160019801100001002003820038200382003820038
10204200371501806119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001001117180160019800100001002003820038200382003820038
1020420037150606119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001001117170160019800100001002003820038200382003820038
10204200371503306119686251010010010000100100005002847521200182003720037184287187401010020010008200200162003720037111020110099100100100001001117180160019800100001002003820038200382003820038
10204200371505106119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001001117170160019801100001002003820038200382003820086
10204200371501206119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001001117170160019801100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150426119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
1002420037150606119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
1002420037150126119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
1002420037150546119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
1002420037150816119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100661216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120054200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
1002420037150606119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
100242003715096119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
1002420037150336119686251002210100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038
100242003715006119686441001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100640216221978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sli v0.8h, v8.8h, #3
  movi v1.16b, 0
  sli v1.8h, v8.8h, #3
  movi v2.16b, 0
  sli v2.8h, v8.8h, #3
  movi v3.16b, 0
  sli v3.8h, v8.8h, #3
  movi v4.16b, 0
  sli v4.8h, v8.8h, #3
  movi v5.16b, 0
  sli v5.8h, v8.8h, #3
  movi v6.16b, 0
  sli v6.8h, v8.8h, #3
  movi v7.16b, 0
  sli v7.8h, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204200891500002102925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100000001111011911611200621600001002006620066200662006620066
160204200651500005702984801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100000001111011911611200621600001002006620066200662006620066
16020420065150000902925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100000001111011911611200621600001002006620066200662006620066
160204200651500001502925801161008001610080028500640196200442006520065612802402008002820016005620065200651116020110099100100160000100000001111011911611200621600001002006620066200662006620066
16020420065150000902925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100000001111011911611200621600001002006620066200662006620066
160204200651510001502925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100000001111011911611200621600001002006620066200662006620066
16020420065150000810292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010000048021111011911611200621600001002006620066200662006620066
1602042006515000024011525801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100000001111011911611200621600001002006620066200662006620066
160204200651500006024225801161008011710080028500641004200442006520065612801282008002820016005620065200651116020110099100100160000100000301111011911611200621600001002006620066200662006620066
160204201471500003841922925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100010001111011911611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600242007315000033044278001010800001080000506400001152003120050200503218001020800002016000020050200501116002110910101600001000000000100318211125211102220047201160000102005120051200512005120051
160024200501500001804429800101080000108000050640000115200312005020050321800102080000201600002005020050111600211091010160000100000000010045821222542282220056201160000102006020060200602006020060
16002420050150000904427800101080000108000050640000115200312005020050321800102080000201600002005020050111600211091010160000100000000010032852925211222220047201160000102005120051200512005120051
1600242005015000000234278001010800001080000506400001152003120050200503218001020800002016000020050200591116002110910101600001000000000100438412125211212120047201160000102005120051200512005120051
1600242005015000018604429800101080000108000050640000015200312006820068321800102080000201600002005020050111600211091010160000100000000010045851212521121720047201160000102005120051200512005120051
160024200501500000044278001010800001080000506400001152003120050200503218001020800002016000020050200501116002110910101600001000000000100458612125211212120047201160000102005120051200512005120113
160024200501500006015927800101080000108000050640000115200312005020050321800102080000201600002005020050111600211091010160000100000000010045851212521118820047201160000102005120051200512005120051
160024200501510007204427800101080000108000050640000115200312005020050321800102080000201600002005020050111600211091010160000100000000010031851925211212120047201160000102005120051200512005120051
1600242005015000000442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000000001003285192521172120047201160000102005120051200512005120051
160024200501510000044278001010800001080000506400001152003120050200503218001020800002016000020050200501116002110910101600001000000000100458512125211212120047201160000102005120051200512005120051

Test 5: throughput

Count: 16

Code:

  sli v0.8h, v16.8h, #3
  sli v1.8h, v16.8h, #3
  sli v2.8h, v16.8h, #3
  sli v3.8h, v16.8h, #3
  sli v4.8h, v16.8h, #3
  sli v5.8h, v16.8h, #3
  sli v6.8h, v16.8h, #3
  sli v7.8h, v16.8h, #3
  sli v8.8h, v16.8h, #3
  sli v9.8h, v16.8h, #3
  sli v10.8h, v16.8h, #3
  sli v11.8h, v16.8h, #3
  sli v12.8h, v16.8h, #3
  sli v13.8h, v16.8h, #3
  sli v14.8h, v16.8h, #3
  sli v15.8h, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204400612999292516010810016000810016002050012801321400190400384003819977619989160120200160032200320064400384003811160201100991001001600001006161111011816400351600001004003940039400394003940039
16020440038300029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011816400351600001004003940039400394003940039
160204400382990504251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011816400351600001004003940039400394003940039
16020440038301029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011816400351600001004003940039400394003940039
16020440038300029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011816400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001904003840038199776199891601202001600322003200644003840038111602011009910010016000010001531111011816400351600001004003940039400394003940039
16020440038299029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011816400351600001004003940039400394003940039
16020440038300029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011816400351600001004003940039400394003940039
160204400383000504251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011816400351600001004003940039400394003940039
16020440038299029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100002211011816400351600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 09 | 0b | 18 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440050300000006625160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228315162113440035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228416162113440035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228416162117640035208160000104003940039400394003940039
16002440038299000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228414162116640035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228414162117640035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228414162117440035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000010000100228414162117740035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228514162113440035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228516162116440035208160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000000100228516162116440035208160000104003940039400394003940039