Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRI (vector, 2S)

Test 1: uops

Code:

  sri v0.2s, v1.2s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100420371606116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371508216872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715636116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715012816872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110002073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sri v0.2s, v1.2s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500000006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
102042003715000000014719687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
102042003715000000010319687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100001030071021622197910100001002003820038200382003820038
10204200371500100006119687251010010010000100100005002847680120054200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680020018200372003718422318745102522001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
10204200371490000006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
10204200371500000008419687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071021622197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201017220200002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100765028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500631196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221993110000102003820038200382003820038
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010002640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sri v0.2s, v0.2s, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715011006119686251010010010000100100005002847521020018200372003718428718741101002001000820020016200372003711102011009910010010000100001117181161119804100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521020018200372003718428618740101002001000820020016200372003711102011009910010010000100001117171161119804100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521020018200372003718428618741101002001000820020016200372003711102011009910010010000100001117171161119805100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521020018200372003718428618740101002001000820020016200372003711102011009910010010000100001117171161119804100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521020018200372003718428618740101002001000820020016200372003711102011009910010010000100001117171161119804100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521020018200372003718428618741101002001000820020016200372003711102011009910010010000100001117171161119805100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521120018200372003718428718741101002001000820020016200372003711102011009910010010000100001117171161119804100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521020018200372003718428618740101002001000820020016200372003711102011009910010010000100001117181161119805100001002003820038200382003820038
1020420037150110036719686251010010010000100100005002847521120018200372003718428718741101002001000820020016200372003711102011009910010010000100001117181161119804100001002003820038200382003820038
102042003715011006119686251010010010000100100005002847521020018200372003718428718741101002001000820020016200372003711102011009910010010000100001117181161119804100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500000000061196862510010101000010100005028475211200182003720037184433187671001020100002020326200372013121100211091010100001000000006402162219786010000102003820038200382003820038
100242003715000000000611968625100101010000101000050284752112001820037201321844311187671001020100002020000200372003721100211091010100001000000006402162219786010000102003820038200382003820038
10024200371500000000061196862510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000000006402162219786010000102003820038200382003820038
100242003715000000000103196862510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000000006402162219786010000102003820038200382003820038
1002420037150000000002500196291211008610100721210760612853841020234203142027218456291885910924221097324216862008520322711002110910101000010220001172847464642320002010000102027620309203172032220325
1002420227152011677955281305719620140101001010060101083250285509212023420215203191846027188831046824110162020000200372003711100211091010100001000002007043163419966010000102027520262202752027820134
10024203211510105566044002489196311431008413100721410912502855105020090203192027418464141887810826241100420206762032520321611002110910101000010000101017647683643320003110000102032120273203282027520311
100242032215200166678528135381968625100101010000101000050284752112001820037200371844331876710010201000020200002003720037111002110910101000010000001194806822162219894010000102003820038202762027420320
1002420309152100768015280343519620183100961210036141091272285636912023420372203691846928189271107620111722422344203692035981100211091010100001000201606402323220042010000102041820409203692032220371
1002420133150001324143520611968625100101010000101000050284752112001820037200371844312188041122822113512021002204162036881100211091010100001000200026882162219786010000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sri v0.2s, v8.2s, #3
  movi v1.16b, 0
  sri v1.2s, v8.2s, #3
  movi v2.16b, 0
  sri v2.2s, v8.2s, #3
  movi v3.16b, 0
  sri v3.2s, v8.2s, #3
  movi v4.16b, 0
  sri v4.2s, v8.2s, #3
  movi v5.16b, 0
  sri v5.2s, v8.2s, #3
  movi v6.16b, 0
  sri v6.2s, v8.2s, #3
  movi v7.16b, 0
  sri v7.2s, v8.2s, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2509

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602042008815000000007125801161008001610080028500640196120044200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
1602042006515000000002925801161008001610080028500640196120044200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
16020420065150000000091125801161008001610080028500640196120044200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
1602042006515000000002925801161008001610080028500640196120044200652006561280128200800282001600562006520065111602011009910010016000010000020011110119016002006201600001002006620066200662030520066
1602042006515000000012925801161008001610080028500640196120044200652006561280128200800282001600562006520065111602011009910010016000010000000011110119016002006201600001002006620066200662006620066
1602042006515000000002925801161008001610080028500640196120044200652006561280128200800282001600562006520065111602011009910010016000010000010011110119084002006201600001002006620066200662006620066
1602042006515000000002925801161008001610080028500640196120044200652006563180128200800282001600562006520065111602011009910010016000010000000311110119016002006201600001002015620066200662006620147
160204200651500001132002928801191008001910080031500640220120056200762007691080131200800312001600622007620077111602011009910010016000010000000022210131123112007401600001002007720077200772007820077
1602042007615100000006428801191008001910080031500640220120056200772007691080131200800312001600622007620077111602011009910010016000010000000022210131123112007301600001002007720077200782007720077
1602042007715000000006428801191008001910080031500640220020056200772007791080131200800312001600622007620077111602011009910010016000010000000022210130123112007401600001002007720077200782007820078

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420102150034427800101080000108000050640000105200312005020050321802112080000201600002005920059111600211091010160000100000100343322025211112120047201160000102005120051200512005120051
1600242005015000442780010108000010800005064000011520031200502005032180010208000020160000200592005911160021109101016000010000210044811825211201020047402160000102005120060200602006020060
16002420050150004429800101080000108000050640000015200402005920059321800102080000201600002005920059111600211091010160000100000100448412025211112120047201160000102006020060200602005120051
16002420050150005029800101080000108000050640000015200312005920059321800102080000201600002005920059111600211091010160000100000100443412025211112120047202160000102006020060200602005120051
16002420050151004427800101080000108000050640000115200402005020050321800102080000201600002005920050111600211091010160000100000100448422025211212020047201160000102005120051200512005120051
1600242005015001244298001010800001080000506400000152003120050200501721800102080000201600002005020050111600211091010160000100100100448212025221102020047201160000102005120051200512005120051
1600242005015000442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000010034841825211201020047201160000102005120051200512005120051
16002420050150004427800101080000108000050640000115200312005720050321800102080000201600002005020050111600211091010160000100460100448412025211212020047201160000102005120051200512005120051
16002420050150104427800101080000108000050640000115200402005920059321800102080000201600002005020050111600211091010160000100000100448412025211102020047402160000102005120051200512005120051
16002420059150004427800101080000108000050640000105200312005020050321800102080000201600002005020050111600211091010160000100030100443112025211202020047201160000102005120051200512005120051

Test 5: throughput

Count: 16

Code:

  sri v0.2s, v16.2s, #3
  sri v1.2s, v16.2s, #3
  sri v2.2s, v16.2s, #3
  sri v3.2s, v16.2s, #3
  sri v4.2s, v16.2s, #3
  sri v5.2s, v16.2s, #3
  sri v6.2s, v16.2s, #3
  sri v7.2s, v16.2s, #3
  sri v8.2s, v16.2s, #3
  sri v9.2s, v16.2s, #3
  sri v10.2s, v16.2s, #3
  sri v11.2s, v16.2s, #3
  sri v12.2s, v16.2s, #3
  sri v13.2s, v16.2s, #3
  sri v14.2s, v16.2s, #3
  sri v15.2s, v16.2s, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440057321000030000782516011210016020410016002850012801960400284004840048199776199891601202001600322003200644003840038111602011009910010016000010000000011110118016024003501600001004003940039400394003940039
160204400383220000000002332516010810016000810016002050012801321400284004840048199766199891601202001600322003200644003840038111602011009910010016000010020000011110118016004003501600001004003940039400394003940039
1602044003831000000030300292516010810016000810016002050012801320400194003840038199776199891601202001600322003202624003840038111602011009910010016000010000000011110168016004003501600001004003940039400394003940039
1602044003831000000000029251601081001600081001600205001280132040019401674008919977619989160120200160032200320064400384003811160201100991001001600001000000470011110118016104003501600001004003940039400394003940039
160204400383100001020001242516010810016000810016002050012801320400194003840038199776199891601202001601292003202644011140110311602011009910010016000010000000011110118016004003501600001004016240039400394003940048
16020440048322010000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000011110118016004003501600001004003940039400394003940039
16020440038311000000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000011110118057004003501600001004003940039400394003940039
16020440038310000001000642616011610016001610016002850012817240400284004840049199769199861601282001600382003200764004840048111602011009910010016000010002000022210129123114004501600001004004940050400494004940049
160204401713110010000006426160116100160016100160225500128019604002840049400481997610199861601282001600382003200764004840048111602011009910010016000010020000022210128123114004501600001004005040049400494005040049
16020440049311000000000642716011610016001610016002850012801960400884004840049199769199861601282001600382003200764004840048111602011009910010016000010022000022210129123114004501600001004021540049400494004940049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cd | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600244003929900008092516001010160000101600005012800000140019040038400381999603200181600102016000020320000400384003811160021109101016000010000100223114162115540035155160000104003940039400394003940039
160024400382990000452516001010160000101600005012800000140019040038400381999603200181600102016000020320000400384003811160021109101016000010000100223114162113540035155160000104003940039400394003940039
160024400382990000452516001010160000101600005012800000140019040038400381999603200181600102016000020320000400384003811160021109101016000010030100223114162114440035155160000104003940039400394003940039
160024400383000000452516001010160000101600005012800001140019040038400381999603200181600102016000020320000400384003811160021109101016000010000100226114162114440035155160000104010040039400394003940039
1600244003830000004525160010101600001016000050128000011400190400384003819996732001816001020160000203200004003840038111600211091010160000100001002231151621145400351510160000104003940039400394003940039
160024400383000000742516001010160000101600005012800001140019040038400381999603200181600102016000020320000400384003811160021109101016000010000100223114162115540035305160000104003940039400394003940039
160024400383000000452516001010160000101600005012800000140019040038400381999603200181600102016000020320000400384003811160021109101016000010002100223114162114340035155160000104003940039400394003940039
160024400382990000452516001010160000101600005012800000140019040038400381999603200181600102016000020320000400384003811160021109101016000010200100223113164114440035155160000104003940039400394003940039
160024400383000000452516001010160000101600005012800000140019040038400381999503200181600102016000020320000400384003811160021109101016000010000100223114162114440035155160000104003940039400394003940039
160024400383000000452516001010160000101600005012800001140019040038400381999603200181600102016000020320000400384003811160021109101016000010000100223123162124340035155160000104003940039400394003940039