Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SLI (vector, 2D)

Test 1: uops

Code:

  sli v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073216111787100020382038203820382038
100420371608416872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715246116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116211787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371596116872510001000100026468012018203720371572318951000100020002037203711100110007073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sli v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)03mmu table walk data (08)1e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0eb? simd retires (ee)? int retires (ef)f5f6f7f8fd
102042003715007261196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197911100001002003820038200382003820038
102042003715001261196872510100100100001001000050028476800200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
1020420037150050782196872510100100100001001000050028489630200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
102042003715002161196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
1020420037150046561196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500192441196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500061196872510100100100001201000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
102042003715002461196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)03181e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100242003714902706119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715103072619687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100030640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502852812020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000120640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000120640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sli v0.2d, v0.2d, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfmap dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
102042003715006119686251010010010000100100005002847521120018200372003718428061874110100200100082002001620037200371110201100991001001000010001201117171600198000100001002003820038200382003820038
10204200371500611968625101001001000010010000500284752112001820037200371842806187411010020010008200200162003720037111020110099100100100001000121117181600198010100001002003820038200382003820038
1020420037150061196862510100100100001001000050028475211200182003720037184280718740101002001000820020016200372003711102011009910010010000100001117181600198010100001002003820038200382003820038
1020420037150082196862510100100100001001000050028475211200182003720037184280618741101002001000820020016200372003711102011009910010010000100001117171600198012100001002003820038200382003820038
10204200371502461196862510100100100001001000050028475211200182003720037184280618741101002001000820020016200372003711102011009910010010000100001117181600198010100001002003820038200382003820038
102042003715006119686251010010010000100100005002847521120018200372003718428061874110100200100082002001620037200371110201100991001001000010001531117181600198010100001002003820038200382003820038
1020420037150061196862510100100100001001000050028475211200182003720037184280618740101002001000820020016200372003711102011009910010010000100001117181600198000100001002003820038200382003820038
10204200371500611968625101001001000010010000500284752112001820037200371842806187411010020010008200200162003720037111020110099100100100001000151117181600198000100001002003820038200382003820038
1020420037150061196862510100100100001001000050028496931200182003720037184280618740101002001000820020016200372003711102011009910010010000100001117181600198010100001002003820038200382003820038
1020420037150061196862510100100100001001000050028475211200182003720037184280618740101002001000820020016200372003711102011009910010010000100001117171600198010100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)033f4e5051schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100242003715061196860251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100114640316331978610000102003820038200382003820038
1002420037150611968620021251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100138640316331978610000102003820038200382003820038
1002420037150611968620021251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100117640316331978610000102003820038200382003820038
10024200371506119686200212510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001003640316331978610000102003820038200382003820038
1002420037150611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640316331978610000102003820038200382003820038
1002420037150611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001016640316331978610000102003820038200382003820038
1002420037150611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640316331978610000102003820038200382003820038
1002420037150611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640316331978610000102003820038200382003820038
1002420037150611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640316331978610000102003820038200382003820038
1002420037150611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640316331978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sli v0.2d, v8.2d, #3
  movi v1.16b, 0
  sli v1.2d, v8.2d, #3
  movi v2.16b, 0
  sli v2.2d, v8.2d, #3
  movi v3.16b, 0
  sli v3.2d, v8.2d, #3
  movi v4.16b, 0
  sli v4.2d, v8.2d, #3
  movi v5.16b, 0
  sli v5.2d, v8.2d, #3
  movi v6.16b, 0
  sli v6.2d, v8.2d, #3
  movi v7.16b, 0
  sli v7.2d, v8.2d, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)033f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fst unit uop (a7)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020420091150292580116100800161028002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001002801111011901600200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000301111011901600200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000401111011901600200621600001002006620066200662006620066
16020420065151292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
160204200651512192580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000101111011901600200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000201111011901600200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000501111011901600200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000101111011901600200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000101111011901600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2507

retire uop (01)cycle (02)03mmu table walk instruction (07)193a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
160024200751500004429800101080000108000050640000012003120052200503218001020800002016000020050200501116002110910101600001000100463112025211231820047201160000102005120053200532026520053
1600242005215000044278001010800001080000506400001120031200522005032180010208000020160000200522005011160021109101016000010446100413111825211221820047201160000102005320051200512026020051
160024200501500104427800101080000108000050640000112003120050200503218001020800002016000020050200501116002110910101600001003100453112325211182220049211160000102005320051200512024920051
160024200521500004427800101080000108000050640000112004020050200503218001020800002016000020050200501116002110910101600001006100463412425211182220047201160000102005320053200512026720062
160024200501500005029800101080000108000050640000012003120050200503218001020800002016000020050200591116002110910101600001000100403111725211211720047201160000102005120051200512024720060
160024200611500004429800101080000108000050640000012004020059200593218001020800002016000020059200591116002110910101600001000100466522036422221820056402160000102006020060200602026120051
160024200501500004429800101080297108000050640000012004020059200593218001020800002016000020059200591116002110910101600001006100466222036422202020056402160000102006020060200602028020060
1600242005915000071527800101080000108000050640000112003120050200503218001020800002016000020052200521116002110910101600001060100436221734422162020049402160000102006020060200602025120060
160024200611510005029800101080000108000050640000012004020059200503218001020800002016000020059200591116002110910101600001006100436522136422171820058402160000102006020060200602026620060
160024200611500005029800101080000108000050640000012004020061200593218001020800002016000020061200591116002110910101600001000100446521934422212320058412160000102006020060200602026820062

Test 5: throughput

Count: 16

Code:

  sli v0.2d, v16.2d, #3
  sli v1.2d, v16.2d, #3
  sli v2.2d, v16.2d, #3
  sli v3.2d, v16.2d, #3
  sli v4.2d, v16.2d, #3
  sli v5.2d, v16.2d, #3
  sli v6.2d, v16.2d, #3
  sli v7.2d, v16.2d, #3
  sli v8.2d, v16.2d, #3
  sli v9.2d, v16.2d, #3
  sli v10.2d, v16.2d, #3
  sli v11.2d, v16.2d, #3
  sli v12.2d, v16.2d, #3
  sli v13.2d, v16.2d, #3
  sli v14.2d, v16.2d, #3
  sli v15.2d, v16.2d, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)031e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfmap dispatch bubble (d6)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020440038300921925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000101111011816400351600001004003940039400394003940039
16020440038300071251601081001600081001600205001280132140019400384003819977619989160120200160032200320064400384003811160201100991001001600001000002561111011816400351600001004003940039400394003940039
160204400383001271251601081001601081001600205001280132140019400384003819977619989160120200160032200320064400384008611160201100991001001600001002205001111011816400351600001004003940039400394003940039
16020440038300029251601081001600081001600205001280132140019400384003819977619989160120200160032200320064400384003811160201100991001001600001000001031111011816400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000801111011816400351600001004003940039400394003940039
16020440038300217125160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000001111011816400351600001004003940039400394003940039
16020440038300029251601081001600081001600205001280132140019400384003819977619989160120200160032200320064400384003811160201100991001001600001000005901111011816400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000001111011816400351600001004003940039400394003940039
1602044003829902925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000101111011816400351600001004003940039400394003940039
1602044003829902925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000101111011816400351600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)03mmu table walk data (08)1e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002440046300004525160010101600001016000050128000011400194003840038199960320018160010201601322032000040038400381116002110910101600001000000100256211216422910400354016160000104003940039400394003940039
1600244003829900452516001010160000101600005012800001140019400384003819996032001816001020160000203200004003840038111600211091010160000100000010025622916422107400354016160000104003940039400394003940039
160024400383000045251600101016000010160000501280000104001940038400381999603200181600102016000020320000400384003811160021109101016000010000001002231151621151040035208160000104003940039400394003940039
160024400382990051251600101016000010160000501280000114001940038400381999603200181600102016000020320000400384003811160021109101016000010200001002532151622110540035408160000104003940039400394003940039
16002440038300004525160010101600001016000050128000011400194003840038199960320018160010201600002032000040038400381116002110910101600001000000100253121016412105400352016160000104003940039400394003940039
16002440038300004525160010101600001016000050128000000400194003840038199960320018160010201600002032000040038400381116002110910101600001000000100256221016422510400354016160000104003940039400394003940039
1600244009130000512516001010160000101600005012800001040019400384003819996032001816001020160000203200004003840038111600211091010160000100000010025312516412510400352016160000104003940039400394003940039
1600244003830000512516001010160000101600005012800001140019400384003819996032001816001020160000203200004003840038111600211091010160000100000010022311101621161140035208160000104003940039400394003940039
1600244009031300452516001010160000101600005012800001140019400384003819996032001816001020160000203200004003840038111600211091010160000100000910022311101621151040035208160000104003940039400394003940039
1600244003829900710251600101016000010160000501280000014001940038400381999603200181600102016000020320000400384003811160021109101016000010000001002231151621151040035208160000104003940039400394003940039