Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

XTN2 (2D)

Test 1: uops

Code:

  xtn2 v0.4s, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10042037160611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037160611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037160611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037160611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037150841687251000100010002646802018203720371572318951000100020002037203711100110000373116111787100020382038203820382038
10042037150611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037160611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037160611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037150611687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037160821687251000100010002646802018203720371572318951000100020002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  xtn2 v0.4s, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715509657196872510164119100241131030465428515290200182008520085184223187451010020010000200200002013220085111020110099100100100001003063071011611197910100001002003820038200862003820038
102042003715513181031968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037155051611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
102042003715509611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037161024611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
102042003715600611968725101001001000010010000500284768002001820037201311842231876310100200100002002000020037200371110201100991001001000010000371011611197910100001002003820038200382003820038
102042003715500611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
102042003715503611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
1020420037155001031968725101001001000010010152500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371550243461968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037155000003930316196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
100242003715500000363089196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
100242003715500000315061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
100242003715500000336061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
100242003715500000342061196872510010101000010100005028476800200182003720037184443188401001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
100242003715500000306061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
10024200371550000093061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
10024200371550000000726196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
100242003715600000192061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038
1002420037155000000061196872510010101000010100005028476801200182003720037184443187671001022100002020000200372003711100211091010100001000000006403163319785010000102003820038200382003820038

Test 3: Latency 1->2

Code:

  xtn2 v0.4s, v0.2d
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500006119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001000001117180160019800100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718428618741101002001000820020016200372003711102011009910010010000100050781117170160019801100001002003820038200382003820038
10204200371500006119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001000131117170160019800100001002003820038200382003820038
10204200371500006119686251010010010000100104565002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000001117170160019801100001002003820038200382003820038
102042003715000060719686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000261291117170160019800100001002003820086200382003820038
102042003715000061196862510100100100001001000050028475212001820037200371842871874110100200100082002001620037200371110201100991001001000010001121117170160019800100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475212001820037200371842861874010100200100082002001620037200371110201100991001001000010006331117180160019800100001002003820038200382003820038
10204200371500006119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001000131117170160019801100001002003820038200382003820038
10204200371500006119686251010010010048100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001000061117180240019801100001002003820038200382003820038
1020420037150926406119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000031117170160019800100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100010640416431978610000102003820038200382003820038
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100010640416331978610000102003820038200382003820038
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100000640316441978610000102003820038200382003820038
10024200371560061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100000640416341978610000102003820038200382003820038
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100000640416431978610000102003820038200382003820038
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100000640416341978610000102003820038200382003820038
10024200371560061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000102000640416431978610000102003820038200382003820038
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100000640316341978610000102003820038200382003820038
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100000640416431978610000102003820038200382003820038
10024200371550061196862510010101000010100005028475211200180200372003718443318767100102010000202000020037200371110021109101010000100000640416231978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  xtn2 v0.4s, v8.2d
  movi v1.16b, 0
  xtn2 v1.4s, v8.2d
  movi v2.16b, 0
  xtn2 v2.4s, v8.2d
  movi v3.16b, 0
  xtn2 v3.4s, v8.2d
  movi v4.16b, 0
  xtn2 v4.4s, v8.2d
  movi v5.16b, 0
  xtn2 v5.4s, v8.2d
  movi v6.16b, 0
  xtn2 v6.4s, v8.2d
  movi v7.16b, 0
  xtn2 v7.4s, v8.2d
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5e | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420090155000006942580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
1602042006515500000292580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001700200621600001002006620066200662006620066
1602042006515500000292580192100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
1602042006515500000292580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
1602042006515600000292580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
16020420065156000012712580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
1602042006515600000292580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
1602042006515600000722580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
16020420065155000012292580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066
1602042006515600000292580116100800161008002850064019600020044200652006561280128200800282001600562006520065111602011009910010016000010000011110119001600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200651560044258001010800001080000506400001002002620045200453218001020800002016000020045200451116002110910101600001000100308114202117620042030160000102005420046200462004620046
160024200451550044258001010800001080000506400001002002620045200453218001020800002016000020045200451116002110910101600001000100308116202114320042030160000102005020046200462004620046
1600242004515500442580010108000010800005064000010020026200452004532180010208000020160000200452004511160021109101016000010376100303114202114320042015160000102004620046200462004620046
160024200451550044258001010800001080000506400001052002620045200453218001020800002016000020045200451116002110910101600001010100313317232114620042315160000102005020046200462004620046
16002420045155001035258001010800001080000506400001152002620045200453218001020800002016000020045200451116002110910101600001020100263116202117420042015160000102004620046200462004620046
1600242004916100442580010108000010800005064000011520026200452004532180010208000020160000200452004611160021109101016000010410100293214202113420042015160000102004620046200462004620046
160024200451550044258001010800001080000506400001002002620045200453218001020800002016000020045200451116002110910101600001000100313114202114320042016160000102005020046200462004620046
1600242004515500987258001010800001080000506400001102002620045200453218001020800002016000020045200451116002110910101600001000100263113202113420042015160000102004620046200462004620046
160024200451560044258001010800001080000506400001002002620045200453218001020800002016000020045200451116002110910101600001000100283114204114420042015160000102005020046200462004620046
160024200451550044258001010800001080000506400001002002620045200453218001020800002016000020045200451116002110910101600001050100273114202118420042015160000102004620046200462004620046

Test 5: throughput

Count: 16

Code:

  xtn2 v0.4s, v16.2d
  xtn2 v1.4s, v16.2d
  xtn2 v2.4s, v16.2d
  xtn2 v3.4s, v16.2d
  xtn2 v4.4s, v16.2d
  xtn2 v5.4s, v16.2d
  xtn2 v6.4s, v16.2d
  xtn2 v7.4s, v16.2d
  xtn2 v8.4s, v16.2d
  xtn2 v9.4s, v16.2d
  xtn2 v10.4s, v16.2d
  xtn2 v11.4s, v16.2d
  xtn2 v12.4s, v16.2d
  xtn2 v13.4s, v16.2d
  xtn2 v14.4s, v16.2d
  xtn2 v15.4s, v16.2d
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2504

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020440074310000000105602516010810016000810016002350013201291400760400384009520012062018016012020016003220032006440038401001116020110099100100160000100000003011110118116114017901600001004003940344400394005840157
160204400953100000000016002516010810016000810016002350012801320400660400854007419998062003616012320016003220032006440099400991116020110099100100160000100000000011110118016104013301600001004003940096401424008640039
16020440038310000000002932516010810016000810016002050012801320400190400384009519977062006316012020016003220032006440038400951116020110099100100160000100000000011110118016004008201600001004005840157401574005840096
1602044009531000000078046602516012310016000810016002050012801320400760400384009519977061998916012020016003220032006440038400381116020110099100100160000100000000011110118016104003501600001004003940086400394008640039
16020440055310000000580301112516017810016000810016002050055983021400760400954009519978062010716012120016003220032006440095400381116020110099100100160000100000000011110118016004003501600001004009640039400964009640096
1602044009531000000000177802516018910016002310016002050054826161400760400384003820007062005216012020016003220032006440099400381116020110099100100160000100420000011110118116004046201600001004009640096400394003940096
1602044003831100000058018551112516010810016007810016002350015117241400660400744003819977061998916012320016003220032006440057400631116020110099100100160000100000000011110118016004009201600001004005840039400964003940086
160204400383110100001440193632516010910016000910016002050012801321400700400384014119977062002516012020016003220032006440085400381116020110099100100160000100000003011110118016104008201600001004003940039400864003940039
160204400383110000001280137002516010810016000810016002050012801320400190400394005719977061998916012020016003220032006440099400381116020110099100100160000100000000011110118116114003501600001004003940100400394009640096
1602044018231000000000177002516010810016007810016002050012801321400760400384005719977061999016012020016003220032006440039400731116020110099100100160000100000000011110118016014003501600001004004040039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024400743220000006702516006210160000101600005012800001140019400954009519996032001816001020160000203200004003840095111600211091010160000100000001002231116162111916400351550160000104003940039400394003940039
16002440038322000000731112516001010160000101600005012800001140019400384003819996032001816001020160000203200004003840095111600211091010160000100000001002231117162111921400541570160000104013840039400394003940039
1600244003931000000045025160010101600001016000050128000011400764009540039199960320075160010201600002032000040038400381116002110910101600001000000010022311171642216214003530100160000104003940154400964003940039
160024400383230000006302516001010160000101600005012800001140019400654007719996132001816001020160000203200004003840057111600211091010160000100000001002231114161111519400351560160000104005840039400404003940039
160024400383100000005102516001010160000101600005012800001140019400384003819996032001816001020160000203200004003940065111600211091010160000100000001002233120162111919400351550160000104003940040402084003940058
16002440038310000000451112516001110160000101600005012800001140019400384003819996032001816001020160000203200004005740057111600211091010160000100000001002233120162111923400351550160000104003940039400394003940039
1600244003831000300046025160010101600001016000050131999711400194003840038199960320095160010201600002032000040038400381116002110910101600001000000010024622201642219154015031120160000104003940058400394003940058
1600244003931000000051025160010101600001016000050128000001400194003840057199970320037160010201600002032000040038400381116002110910101600001000000010024622221642219214003530100160000104003940058400394005840039
16002440057310000000450251600101016000010160000501280000014001940038400381999603200181600102016000020320000400954003811160021109101016000010000000100246221816322192040035301042160000104003940058400394005840039
16002440057311011207805201041601581016000010160000505519418114017940095401001999603200181600102016000020320000400394020711160021109101016000010000000100243221816312192040054311400160000104003940058400404005840101