Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

XTN2 (4S)

Test 1: uops

Code:

  xtn2 v0.8h, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10042037161561168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
1004203716061168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
10042037160103168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
1004203716361168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
1004203715061168725100010001000264680120182037203715723189510001000200020372037111001100000731381117870100020382038203820382038
1004203716061168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
1004203716061168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
1004203715061168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
10042037151261168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038
10042037160104168725100010001000264680120182037203715723189510001000200020372037111001100000731161117870100020382038203820382038

Test 2: Latency 1->1

Code:

  xtn2 v0.8h, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371550024611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037156010611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037156000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037155000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037155000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037155000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037155000611968725101001001000010010000500284768012001820037200371842231874510270200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200862003820038
1020420037155000961968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000673411611197910100001002003820038200382003820038
1020420037156000611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371560006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100001571011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715006119687251001010100001010000502847680120018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680120018200372003718444031876710010201000020200002003720071111002110910101000010000640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680020018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680120018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680020018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001015100640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680120018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680120018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680120018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715006119687251001010100001010000502847680120018200372003718444031876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  xtn2 v0.8h, v0.4s
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715512061196862510100100100001001000050028475210200182003720037184286187311010020010008200200162003720037111020110099100100100001002961117170160019800100001002003820038200382003820038
1020420037155006119686251010010010000100100005002847521020018200372003718428718740101002001000820020016200372003711102011009910010010000100761117170160019800100001002003820038200382003820038
1020420037155006119686251010010010000100100005002847521020018200372003718428718741101002001000820020016200372003711102011009910010010000100861117170160019801100001002003820038200382003820038
1020420037155006119666251010010010000100100005002847521020018200372003718428718741101002001000820020016200372003711102011009910010010000100701117180160019800100001002003820038200382003820038
1020420037155006119686251010010010000100100005002847521020018200372003718428718740101002001000820020016200372003711102011009910010010000100101117170160019800100001002003820038200382003820038
10204200371550072619686251010010010000100100005002847521020018200372003718428618740101002001000820020016200372003711102011009910010010000100001117180160019800100001002003820038200382003820038
10204200371550061196862510100100100001001000050028475210200182003720037184286187411010020010008200200162003720037111020110099100100100001004031117180160019800100001002003820038200382003820038
10204200371553006119686251010010010000100100005002847521020018200372003718428718741101002001000820020016200372003711102011009910010010000100201117180160019800100001002003820038200382003820038
1020420037155006119686251010010010000100100005002847521020018200372003718428618740101002001000820020016200372003711102011009910010010000100031117170160019801100001002003820087200382003820038
1020420037156006119686251010010010000100100005002847521020018200372003718428618741101002001000820020016200372003711102011009910010010000100001117170160019800100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ab | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371560007261968625100101010000101000050284752102001820037200371844331876710010201000020200002003720037111002110910101000010490752640216221978610000102003820038200382003820038
10024200851560006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
10024200371550006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037155001210319686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
10024200371550006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242022515500012819686251001010100001010000502848785020018200372003718443318785100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
10024200371600006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037155001289196862510010101000010100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001000630640216221978610000102003820038200382003820038
10024200371550006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100060640216221978610000102003820038200382003820038
10024200371550012611968625100101010000101000050284752102001820037200371844331876710010201000020200002003720037111002110910101000010001620640216221978610000102003820038200382003820038

Test 4: throughput (movi writes each destination before xtn2)

Count: 8

Code:

  movi v0.16b, 0
  xtn2 v0.8h, v8.4s
  movi v1.16b, 0
  xtn2 v1.8h, v8.4s
  movi v2.16b, 0
  xtn2 v2.8h, v8.4s
  movi v3.16b, 0
  xtn2 v3.8h, v8.4s
  movi v4.16b, 0
  xtn2 v4.8h, v8.4s
  movi v5.16b, 0
  xtn2 v5.8h, v8.4s
  movi v6.16b, 0
  xtn2 v6.8h, v8.4s
  movi v7.16b, 0
  xtn2 v7.8h, v8.4s
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420090156009292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000000011110119161200621600001002006620066200662006620066
16020420065155000292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000011110119160200621600001002006620066200662006620066
16020420065156000292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000011110119160200621600001002006620066200662006620066
16020420065155000292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000011110119160200621600001002006620066200662006620066
16020420065155000292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000010011110119160200621600001002006620066200662006620066
1602042006515500029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000011711110119160200621600001002006620066200662006620066
16020420065155000292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000011110119160200621600001002006620066200662006620066
16020420065155000292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000000011110119160200621600001002006620066200662006620066
16020420065156000712580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000000011110119160200621600001002006620066200662006620066
160204200651550092925801161008001610080028500640196020044200652006561280128200800282001600562006520065111602011009910010016000010000001211110119160200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420072155004425800101080000108000050640000110200260200452004532180010208000020160000200452004511160021109101016000010000001003331113326211111120042150160000102004620047202802005120046
16002420045155060862580010108000010800005064000010020026020045200453218001020800002016000020045200451116002110910101600001000000100358119202119720042150160000102004620046202172004620046
1600242004515500442580010108000010800005064000010020026020045200453218001020800002016000020045200451116002110910101600001000000100328119202119820043150160000102004620046202482004620046
1600242004515500862580010108000010800005064000010020026020045200463218001020800002016000020045200451116002110910101600001000030100328611020211111120042150160000102004620046202152004620046
1600242004515500442580010108000010800005064000010020026020045200453218001020800002016000020045200451116002110910101600001000003100333611020211111020042150160000102004620046203792028420046
160024200451550044258001010800001080000506400000102003002004920049321800102080000201600002004920049111600211091010160000100001310035112211244229820046300160000102005020050202472004620046
16002420049155004425800101080000108000050640000105200260200452004532180010208000020160000200452004911160021109101016000010000012100343121122411101020046150160000102004620046202332005024956
16002420045163004425800101080000108000050640000110200260200452004532180010208000020160000200452004511160021109101016000010000001003386192441110920042150160000102004620046205042005020050
160024200491560044258001010800001080000506400001052003002004920049321800102080000201600002004520045111600211091010160000100001010035862112021210820042150160000102004620046202312005020046
1600242004515500862580010108000010800005064000011520026020045200453218001020800002016000020045200451116002110910101600001002013100323111220211121220042150160000102004620046207412004620046

Test 5: throughput (no movi before each xtn2)

Count: 16

Code:

  xtn2 v0.8h, v16.4s
  xtn2 v1.8h, v16.4s
  xtn2 v2.8h, v16.4s
  xtn2 v3.8h, v16.4s
  xtn2 v4.8h, v16.4s
  xtn2 v5.8h, v16.4s
  xtn2 v6.8h, v16.4s
  xtn2 v7.8h, v16.4s
  xtn2 v8.8h, v16.4s
  xtn2 v9.8h, v16.4s
  xtn2 v10.8h, v16.4s
  xtn2 v11.8h, v16.4s
  xtn2 v12.8h, v16.4s
  xtn2 v13.8h, v16.4s
  xtn2 v14.8h, v16.4s
  xtn2 v15.8h, v16.4s
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2505

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 19 | 1e | 37 | 3f | 4e | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602044013631110100055710251601081001600581001600205005519550154001940089400601997706199891601202001600322003200644003840038111602011009910010016000010000001111011851011611400921600001004003940086400754003940039
1602044003831110100785602251601581001600081001600205005440188154001940038400851999806199891601202001600322003200644009540038111602011009910010016000010000031111011850011611400351600001004005840039400964009640039
16020440095311101097850002516017810016000810016002050055195501040163400384003819977062004616012020016003220032126240095400391116020110099100100160000100014001111011800011611400541600001004006440039400964009640039
1602044009531010100056710251601081001600081001600235005440188104006640038400851999806200361601232001600322003200644003840057111602011009910010016000010001001111011800011611400351600001004008640039400404005840096
16020440095310101001561110251601081001600081001600205001280132104001940038400381997706199891601202001600322003200644009540038111602011009910010016000010000001111011800011611401381600001004018340040400394003940039
1602044003831110100235500251601081001600401001600205001320129104001940038401061997706199891601202001600322003200644003840057111602011009910010016000010000001111011800011611400701600001004003940096400964003940040
16020440038310101000561110251601081001600781001600205005519550104001940095400382000706199891601202001600322003200644003940057111602011009910010016000010000001111011800011611400351600001004008640039400864007540039
1602044008531110100782900251601081001600581001600235001280132004002040095400951997706199891601212001600322003217624003840057111602011009910010016000010000001111011800011611400921600001004009040064400394009640039
1602044003831010100585500251601401001600401001600215001280132104012240038400851999806199891601202001600322003200644006340038111602011009910010016000010000001111011800011611401381600001004003940100400394003940040
160204400383111010002900251601091001600081001600205005519550004007640100400382000706200461601202001600322003200644003840095111602011009910010016000010000001111011800011611400921600001004009640039400964009640096

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600244003931100000000744525160010101600001016000050128000011401344003940077199960320073160010201600002032000040095400381116002110910101600001000000000111742111071645234540035208160000104003940039400584013840039
160024400633100000000787302516001010160078101600005012800000140019400384003820000032001816001020160000203200004003840038111600211091010160000100000000010024312031644133640036208160000104009640096400394003940039
1600244003831100000120234502516001010160144101600005054803771140019400954003819996032001816001020160000203200004003940207111600211091010160000100000000010022311041623224440035208160000104003940039400964003940096
16002440038310000000027451112516009110160000101600005012800001140019400384003819996032001816001020160000203200004003840038111600211091010160000100000000010022311041623334440035208160000104003940039400394003940096
160024400773110000000787002516001010160000101600005055194181140134400774003819996032007516001020160000203200004003840038111600211091010160000100000000010022621061623529740150208160000104003940039400394003940039
160024400953100001113500451112516001010160149101600005055194181140019400774003819996032001816001020160000203200004003840038111600211091010160000100000000010024611071638417840092208160000104003940039401544010140039
160024400383110000000145025160010101600001016000050128000011400194005740039199960320037160010201600002032000040038400571116002110910101600001000000000100223110416323274400354016160000104003940096400394003940039
160024400953100000012081671112516001010160000101600005012800001140019400384012419996032001816001020160000203200004003940057111600211091010160000100000000010024611061638314540092208160000104003940040400404003940039
16002440038311000000005102516003310160078101600005012800000140118400574003819996032001816001020160000203200004003840143111600211091010160000100000430010024622031650517940035408160000104007840096400394003940039
16002440095311000000078902702516001010160000101600005012800001140076400954003820026032001816001020160000203200004015340095111600211091010160000100000000010024622031640514440092208160000104003940039400394009640096